/* Copy SIZE bytes from SRC to DEST.  For SUN4V M7.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef XCC
# define XCC    xcc
#endif
	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

#define	FPRS_FEF	0x04

/*
 * ASI_STBI_P marks a cache line as "least recently used",
 * which means that if many threads are active the line has a high
 * chance of being pushed out of the cache between the first
 * initializing store and the final stores to that line.
 * Thus, this algorithm uses ASI_STBIMRU_P, which marks the cache
 * line as "most recently used", for the initializing store to each
 * cache line, and marks the line as LRU only with its final store.
 */
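/*
 * Concretely (see .Lalign_loop_rest below), each 64-byte destination
 * line is written as: one stxa with ASI_STBIMRU_P to the first
 * doubleword (which allocates the line in cache without fetching its
 * old contents from memory and keeps it MRU), six ordinary stx stores,
 * and a final stxa with ASI_STBI_P that marks the now fully written
 * line as LRU.
 */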

#define	ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define	ASI_ST_BLK_INIT_MRU_P	0xf2

#define	ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define	ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define	BLOCK_SIZE	64	/* L2 data cache line size  */
#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case  */
				/* must be at least 64  */
#define	SMALL_MAX	255	/* max small copy for word/long aligned  */
#define	SMALL_UMAX	128	/* max small copy for unaligned case  */
#define	MED_WMAX	1023	/* max copy for medium word-aligned case  */
#define	MED_MAX		511	/* max copy for medium longword-aligned case  */
#define	ST_CHUNK	20	/* ST_CHUNK - block of values for BIS Store  */
/* On T4, prefetch fcn 20 is a strong read prefetch into the L1 and L2 data
 * caches; it can stall the instruction pipeline if the data is still in
 * memory.  Prefetch fcn 21 is a strong read prefetch into the L2 data
 * cache only, not the L1 data cache.  */
#define	ALIGN_PRE	20	/* distance for aligned prefetch loop  */

#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text

ENTRY(__memmove_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	cmp	%o1, %o0	/* if src address >= dst, use forward copy  */
	bgeu,pn	%XCC, .Lforcpy	/* forward copy is safe in that case  */
	 sub	%o0, %o1, %o4	/* get difference of the two addresses  */
	cmp	%o2, %o4	/* compare size and difference of addresses  */
	bleu,pn	%XCC, .Lforcpy	/* if size <= difference, no overlap  */
	 add	%o1, %o2, %o5	/* get to end of source space  */

/* an overlapped copy that must be done "backwards"  */
.Lchksize:
	cmp	%o2, 8			/* if fewer than 8 bytes, do byte copy  */
	blu,pn %XCC, 2f			/* else continue  */

/* Now size is at least 8  */
.Ldbalign:
	 add	%o0, %o2, %g1		/* get to end of dest space  */
	andcc	%g1, 7, %o3		/* %o3 has cnt til dst 8 byte align  */
	bz,a,pn	%XCC, .Ldbbck		/* skip if dst is 8 byte aligned  */
	 andn	%o2, 7, %o3		/* force %o3 cnt to multiple of 8  */
	sub	%o2, %o3, %o2		/* update o2 with new count  */

1:	dec	%o5			/* decrement source  */
	ldub	[%o5], %g1		/* load one byte  */
	deccc	%o3			/* decrement count  */
	bgu,pt	%XCC, 1b		/* if not done keep copying  */
	 stb	%g1, [%o5+%o4]		/* store one byte into dest  */
	andncc	%o2, 7, %o3		/* force %o3 cnt to multiple of 8  */
	bz,pn	%XCC, 2f		/* if size < 8, move to byte copy  */

/* Now Destination is 8 byte aligned  */
.Ldbbck:
	 andcc	%o5, 7, %o0		/* %o0 has src offset  */
	bz,a,pn	%XCC, .Ldbcopybc	/* if src is aligned do fast memmove  */
	 sub	%o2, %o3, %o2		/* Residue bytes in %o2  */

.Lcpy_dbwdbc:				/* alignment of src is needed  */
	sub	%o2, 8, %o2		/* set size one loop ahead  */
	sll	%o0, 3, %g1		/* %g1 is left shift  */
	mov	64, %g5			/* init %g5 to be 64  */
	sub	%g5, %g1, %g5		/* %g5 rightshift = (64 - leftshift)  */
	sub	%o5, %o0, %o5		/* align the src at 8 bytes.  */
	add	%o4, %o0, %o4		/* increase diff between src & dst  */
	ldx	[%o5], %o1		/* load first 8 bytes  */
	srlx	%o1, %g5, %o1
1:	sub	%o5, 8, %o5		/* subtract 8 from src  */
	ldx	[%o5], %o0		/* load 8 byte  */
	sllx	%o0, %g1, %o3		/* shift loaded val left to tmp reg  */
	or	%o1, %o3, %o3		/* align data  */
	stx	%o3, [%o5+%o4]		/* store 8 byte  */
	subcc	%o2, 8, %o2		/* subtract 8 byte from size  */
	bg,pt	%XCC, 1b		/* if size > 0 continue  */
	 srlx	%o0, %g5, %o1		/* move extra byte for the next use  */

	srl	%g1, 3, %o0		/* restore %o0 value for alignment  */
	add	%o5, %o0, %o5		/* restore src alignment  */
	sub	%o4, %o0, %o4		/* restore diff between src & dest  */

	ba	2f			/* branch to the trailing byte copy  */
	 add	%o2, 8, %o2		/* restore size value  */

.Ldbcopybc:				/* alignment of src is not needed  */
1:	sub	%o5, 8, %o5		/* subtract from src  */
	ldx	[%o5], %g1		/* load 8 bytes  */
	subcc	%o3, 8, %o3		/* subtract from size  */
	bgu,pt	%XCC, 1b		/* if size is greater than 0, continue  */
	 stx	%g1, [%o5+%o4]		/* store 8 bytes to destination  */

	ba	2f
	 nop

.Lbcbyte:
1:	ldub	[%o5], %g1		/* load one byte  */
	stb	%g1, [%o5+%o4]		/* store one byte  */
2:	deccc	%o2			/* decrement size  */
	bgeu,a,pt %XCC, 1b		/* if size is >= 0 continue  */
	 dec	%o5			/* decrement from address  */

.Lexitbc:				/* exit from backward copy  */
	retl
	 add	%o5, %o4, %o0		/* restore dest addr  */


/* Check to see if memmove is large aligned copy
 * If so, use special version of copy that avoids
 * use of block store init.  */
.Lforcpy:
	cmp	%o2, SMALL_MAX		/* check for not small case  */
	blt,pn	%XCC, .Lmv_short	/* merge with memcpy  */
	 mov	%o0, %g1		/* save %o0  */
	neg	%o0, %o5
	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned  */
	brz,pt	%o5, .Lmv_dst_aligned_on_8

/* %o5 has the bytes to be written in partial store.  */
	 sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
7:					/* dst aligning loop  */
	ldub	[%o1+%o0], %o4		/* load one byte  */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	 add	%o0, 1, %o0		/* advance dst  */
	add	%o1, %o0, %o1		/* restore %o1  */
.Lmv_dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pn	%o5, .Lsrc_dst_unaligned_on_8
	 prefetch [%o1 + (1 * BLOCK_SIZE)], 20

.Lmv_src_dst_aligned_on_8:
/* check if we are copying MED_MAX or more bytes  */
	cmp	%o2, MED_MAX		/* limit to store buffer size  */
	bleu,pt	%XCC, .Lmedlong
	 prefetch [%o1 + (2 * BLOCK_SIZE)], 20

/* The mv_align loop below mimics the memcpy code for large aligned copies,
 * but does not use the ASI_STBI_P (block initializing store) performance
 * optimization.  This path is taken when memcpy is incorrectly invoked
 * with overlapping buffers.  */
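/* (A block-initializing store to the first doubleword of a cache line
 * allocates that line without fetching its previous contents from memory,
 * so with overlapping buffers it could clobber source bytes that have not
 * been copied yet; hence the plain stores here.)  */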

.Lmv_large_align8_copy:			/* Src and dst share 8 byte align  */
					/* align dst to 64 byte boundary  */
	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned  */
	brz,pn	%o3, .Lmv_aligned_on_64
	 sub	%o3, 64, %o3		/* %o3 has negative bytes to move  */
	add	%o2, %o3, %o2		/* adjust remaining count  */
.Lmv_align_to_64:
	ldx	[%o1], %o4
	add	%o1, 8, %o1		/* increment src ptr  */
	addcc	%o3, 8, %o3
	stx	%o4, [%o0]
	brnz,pt	%o3, .Lmv_align_to_64
	 add	%o0, 8, %o0		/* increment dst ptr  */

.Lmv_aligned_on_64:
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
.Lmv_align_loop:
	ldx	[%o1],%o4
	stx	%o4,[%o0]
	prefetch [%o0 + (10 * BLOCK_SIZE)], 22
	prefetch [%o1 + (10 * BLOCK_SIZE)], 21
	subcc	%o5, 64, %o5
	ldx	[%o1+8],%o4
	stx	%o4,[%o0+8]
	ldx	[%o1+16],%o4
	stx	%o4,[%o0+16]
	ldx	[%o1+24],%o4
	stx	%o4,[%o0+24]
	ldx	[%o1+32],%o4
	stx	%o4,[%o0+32]
	ldx	[%o1+40],%o4
	stx	%o4,[%o0+40]
	ldx	[%o1+48],%o4
	add	%o1, 64, %o1
	stx	%o4,[%o0+48]
	add	%o0, 64, %o0
	ldx	[%o1-8],%o4
	bgt,pt	%XCC, .Lmv_align_loop
	 stx	%o4,[%o0-8]

	ba	.Lmedlong
	 nop
END(__memmove_niagara7)

ENTRY(__mempcpy_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	ba,pt	%icc, 101f
	 add	%o0, %o2, %g1		/* save dst + len  */
END(__mempcpy_niagara7)

	.align	32
ENTRY(__memcpy_niagara7)
100:	/* %o0=dst, %o1=src, %o2=len */
	mov	%o0, %g1		/* save %o0  */
101:
#ifndef __arch64__
	srl	%o2, 0, %o2
#endif
	cmp	%o2, SMALL_MAX		/* check for not small case  */
	bgeu,pn	%XCC, .Lmedium		/* go to larger cases  */
.Lmv_short:
	 cmp	%o2, SHORTCOPY		/* check for really short case  */
	ble,pn	%XCC, .Lsmallfin
	 or	%o0, %o1, %o4		/* prepare alignment check  */
	andcc	%o4, 0x3, %o5		/* test for word alignment  */
	bnz,pn	%XCC, .Lsmallunalign	/* branch to non-word aligned case  */
	 nop
	subcc	%o2, 7, %o2		/* adjust count  */
	ble,pn	%XCC, .Lsmallwordx
	 andcc	%o4, 0x7, %o5		/* test for long alignment  */
/* 8 or more bytes, src and dest start on word boundary
 * %o4 contains or %o0, %o1  */
.Lsmalllong:
	bnz,pn	%XCC, .Lsmallwords	/* branch to word aligned case  */
	 cmp	%o2, SHORT_LONG-7
	bge,a	%XCC, .Lmedl64		/* if we branch  */
	 sub	%o2,56,%o2		/* adjust %o2 to -63 off count  */

/* slightly unroll the small_long_loop to improve very short copies  */
	cmp	%o2, 32-7
	blt,a,pn %XCC, .Lsmall_long_l
	 sub	%o1, %o0, %o1		/* %o1 gets the difference  */

	ldx	[%o1], %o5
	ldx	[%o1+8], %o4
	ldx	[%o1+16], %o3

	subcc	%o2, 24, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

	stx	%o5, [%o0]		/* write word  */
	stx	%o4, [%o0+8]		/* write word  */
	stx	%o3, [%o0+16]		/* write word  */

	add	%o0, 24, %o0

/* end loop unroll  */

.Lsmall_long_l:
	ldx	[%o1+%o0], %o3
	subcc	%o2, 8, %o2
	add	%o0, 8, %o0
	bgu,pn	%XCC, .Lsmall_long_l	/* loop until done  */
	 stx	%o3, [%o0-8]		/* write word  */
	addcc	%o2, 7, %o2		/* restore %o2 to correct count  */
	bnz,pn	%XCC, .Lsmall_long_x	/* check for completion  */
	 add	%o1, %o0, %o1		/* restore %o1  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
.Lsmall_long_x:
	cmp	%o2, 4			/* check for 4 or more bytes left  */
	blt,pn	%XCC, .Lsmallleft3	/* if not, go to finish up  */
	 nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	bnz,pn	%XCC, .Lsmallleft3
	 add	%o0, 4, %o0
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 32
/* src and dest start on word boundary; 7 or fewer bytes  */
.Lsmallwordx:
	lduw	[%o1], %o3		/* read word  */
	addcc	%o2, 3, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit
	 stw	%o3, [%o0]		/* write word  */
	deccc	%o2			/* reduce count for cc test  */
	ldub	[%o1+4], %o3		/* load one byte  */
	bz,pt	%XCC, .Lsmallexit
	 stb	%o3, [%o0+4]		/* store one byte  */
	ldub	[%o1+5], %o3		/* load second byte  */
	deccc	%o2
	bz,pt	%XCC, .Lsmallexit
	 stb	%o3, [%o0+5]		/* store second byte  */
	ldub	[%o1+6], %o3		/* load third byte  */
	stb	%o3, [%o0+6]		/* store third byte  */
.Lsmallexit:
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 32
.Lsmallunalign:
	cmp	%o2, SHORTCHECK
	ble,pn	%XCC, .Lsmallrest
	 cmp	%o2, SMALL_UMAX
	bge,pt	%XCC, .Lmedium_join
	 andcc	%o1, 0x3, %o5		/* is src word aligned  */
	bz,pn	%XCC, .Laldst
	 cmp	%o5, 2			/* is src half-word aligned  */
	be,pt	%XCC, .Ls2algn
	 cmp	%o5, 3			/* src is byte aligned  */
.Ls1algn:
	ldub	[%o1], %o3		/* move 1 or 3 bytes to align it  */
	inc	1, %o1
	stb	%o3, [%o0]		/* move a byte to align src  */
	inc	1, %o0
	bne,pt	%XCC, .Ls2algn
	 dec	%o2
	b	.Lald			/* now go align dest  */
	 andcc	%o0, 0x3, %o5

.Ls2algn:
	lduh	[%o1], %o3		/* know src is 2 byte aligned  */
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%o0]		/* have to do bytes,  */
	stb	%o3, [%o0 + 1]		/* do not know dst alignment  */
	inc	2, %o0
	dec	2, %o2

.Laldst:
	andcc	%o0, 0x3, %o5		/* align the destination address  */
.Lald:
	bz,pn	%XCC, .Lw4cp
	 cmp	%o5, 2
	be,pn	%XCC, .Lw2cp
	 cmp	%o5, 3
.Lw3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%o0]
	bne,pt	%XCC, .Lw1cp
	 inc	%o0
	dec	1, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	dec	4, %o3			/* avoid reading beyond tail of src  */
	sub	%o1, %o0, %o1		/*  %o1 gets the difference  */

1:	sll	%o4, 8, %g5		/* save residual bytes  */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		/* merge with residual  */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 1b
	 inc	4, %o0
	sub	%o1, 3, %o1		/* used one byte of last word read  */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw1cp:	srl	%o4, 8, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	3, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	dec	4, %o3			/* avoid reading beyond tail of src  */
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

2:	sll	%o4, 24, %g5		/* save residual bytes  */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		/* merge with residual  */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 2b
	 inc	4, %o0
	sub	%o1, 1, %o1		/* used 3 bytes of last word read  */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	2, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	dec	4, %o3			/* avoid reading beyond tail of src  */
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

3:	sll	%o4, 16, %g5		/* save residual bytes  */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		/* merge with residual  */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 3b
	 inc	4, %o0
	sub	%o1, 2, %o1		/* used two bytes of last word read  */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw4cp:	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

1:	lduw	[%o1+%o0], %o4		/* read from address  */
	deccc	4, %o3			/* decrement count  */
	st	%o4, [%o0]		/* write at destination address  */
	bgu,pt	%XCC, 1b
	 inc	4, %o0			/* increment to address  */
	and	%o2, 3, %o2		/* number of leftover bytes, if any  */

	/* simple finish up byte copy, works with any alignment  */
7:
	add	%o1, %o0, %o1		/* restore %o1  */
.Lsmallrest:
	tst	%o2
	bz,pt	%XCC, .Lsmallx
	 cmp	%o2, 4
	blt,pn	%XCC, .Lsmallleft3
	 nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	ldub	[%o1], %o3		/* read byte  */
	subcc	%o2, 4, %o2		/* reduce count by 4  */
	stb	%o3, [%o0]		/* write byte  */
	ldub	[%o1+1], %o3		/* repeat for total of 4 bytes  */
	add	%o1, 4, %o1		/* advance SRC by 4  */
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		/* advance DST by 4  */
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%XCC, .Lsmallnotalign4	/* loop until 3 or fewer bytes remain  */
	 stb	%o3, [%o0-1]
	addcc	%o2, 3, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallx
.Lsmallleft3:				/* 1, 2, or 3 bytes remain  */
	 subcc	%o2, 1, %o2
	ldub	[%o1], %o3		/* load one byte  */
	bz,pt	%XCC, .Lsmallx
	 stb	%o3, [%o0]		/* store one byte  */
	ldub	[%o1+1], %o3		/* load second byte  */
	subcc	%o2, 1, %o2
	bz,pt	%XCC, .Lsmallx
	 stb	%o3, [%o0+1]		/* store second byte  */
	ldub	[%o1+2], %o3		/* load third byte  */
	stb	%o3, [%o0+2]		/* store third byte  */
.Lsmallx:
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

.Lsmallfin:
	tst	%o2
	bnz,pn	%XCC, .Lsmallleft3
	 nop
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Lsmallwords:
	lduw	[%o1], %o3		/* read word  */
	subcc	%o2, 8, %o2		/* update count  */
	stw	%o3, [%o0]		/* write word  */
	add	%o1, 8, %o1		/* update SRC  */
	lduw	[%o1-4], %o3		/* read word  */
	add	%o0, 8, %o0		/* update DST  */
	bgu,pt	%XCC, .Lsmallwords	/* loop until done  */
	 stw	%o3, [%o0-4]		/* write word  */
	addcc	%o2, 7, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit	/* check for completion  */
	 cmp	%o2, 4			/* check for 4 or more bytes left  */
	blt,pt	%XCC, .Lsmallleft3	/* if not, go to finish up  */
	 nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	bnz,pn	%XCC, .Lsmallleft3
	 stw	%o3, [%o0-4]
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Lmedium:
.Lmedium_join:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned  */
	brz,pt	%o5, .Ldst_aligned_on_8

	/* %o5 has the bytes to be written in partial store.  */
	 sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
7:					/* dst aligning loop  */
	ldub	[%o1+%o0], %o4		/* load one byte  */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	 add	%o0, 1, %o0		/* advance dst  */
	add	%o1, %o0, %o1		/* restore %o1  */
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	 nop

.Lsrc_dst_aligned_on_8:
	/* check if we are copying more than MED_MAX bytes  */
	cmp	%o2, MED_MAX		/* limit to store buffer size  */
	bgu,pn	%XCC, .Llarge_align8_copy
	 nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is MED_MAX bytes or less
 */
.Lmedlong:
	subcc	%o2, 63, %o2		/* adjust length to allow cc test  */
	ble,pn	%XCC, .Lmedl63		/* skip big loop if < 64 bytes  */
	 nop
.Lmedl64:
	ldx	[%o1], %o4		/* load  */
	subcc	%o2, 64, %o2		/* decrement length count  */
	stx	%o4, [%o0]		/* and store  */
	ldx	[%o1+8], %o3		/* a block of 64 bytes  */
	stx	%o3, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o3
	stx	%o3, [%o0+24]
	ldx	[%o1+32], %o4		/* load  */
	stx	%o4, [%o0+32]		/* and store  */
	ldx	[%o1+40], %o3		/* a block of 64 bytes  */
	add	%o1, 64, %o1		/* increase src ptr by 64  */
	stx	%o3, [%o0+40]
	ldx	[%o1-16], %o4
	add	%o0, 64, %o0		/* increase dst ptr by 64  */
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%XCC, .Lmedl64		/* repeat if at least 64 bytes left  */
	 stx	%o3, [%o0-8]
.Lmedl63:
	addcc	%o2, 32, %o2		/* adjust remaining count  */
	ble,pt	%XCC, .Lmedl31		/* to skip if 31 or fewer bytes left  */
	 nop
	ldx	[%o1], %o4		/* load  */
	sub	%o2, 32, %o2		/* decrement length count  */
	stx	%o4, [%o0]		/* and store  */
	ldx	[%o1+8], %o3		/* a block of 32 bytes  */
	add	%o1, 32, %o1		/* increase src ptr by 32  */
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		/* increase dst ptr by 32  */
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	stx	%o3, [%o0-8]
.Lmedl31:
	addcc	%o2, 16, %o2		/* adjust remaining count  */
	ble,pt	%XCC, .Lmedl15		/* skip if 15 or fewer bytes left  */
	 nop
	ldx	[%o1], %o4		/* load and store 16 bytes  */
	add	%o1, 16, %o1		/* increase src ptr by 16  */
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		/* decrease count by 16  */
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		/* increase dst ptr by 16  */
	stx	%o3, [%o0-8]
.Lmedl15:
	addcc	%o2, 15, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
	 cmp	%o2, 8
	blt,pt	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left  */
	 tst	%o2
	ldx	[%o1], %o4		/* load 8 bytes  */
	add	%o1, 8, %o1		/* increase src ptr by 8  */
	add	%o0, 8, %o0		/* increase dst ptr by 8  */
	subcc	%o2, 8, %o2		/* decrease count by 8  */
	bnz,pn	%XCC, .Lmedw7
	 stx	%o4, [%o0-8]		/* and store 8 bytes  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Lsrc_dst_unaligned_on_8:
	/* DST is 8-byte aligned, src is not  */
	andcc	%o1, 0x3, %o5		/* test word alignment  */
	bnz,pt	%XCC, .Lunalignsetup	/* branch if not word aligned  */
	 nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries.  Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache, for short to medium
 * sized data moves.
 */
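/*
 * The loops below merge pairs of 32-bit loads into one 64-bit store:
 * since SPARC is big-endian, the earlier word becomes the high half,
 * roughly  dst64 = ((uint64_t) src32[0] << 32) | src32[1].
 */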
	cmp %o2, MED_WMAX		/* limit to store buffer size  */
	bge,pt	%XCC, .Lunalignrejoin	/* if at least MED_WMAX, rejoin the  */
	 nop				/* general unaligned copy path  */

	subcc	%o2, 31, %o2		/* adjust length to allow cc test  */
					/* for end of loop  */
	ble,pt	%XCC, .Lmedw31		/* skip big loop if fewer than 32 bytes  */
.Lmedw32:
	 ld	[%o1], %o4		/* move a block of 32 bytes  */
	sllx	%o4, 32, %o5
	ld	[%o1+4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	subcc	%o2, 32, %o2		/* decrement length count  */
	ld	[%o1+8], %o4
	sllx	%o4, 32, %o5
	ld	[%o1+12], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+8]
	add	%o1, 32, %o1		/* increase src ptr by 32  */
	ld	[%o1-16], %o4
	sllx	%o4, 32, %o5
	ld	[%o1-12], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+16]
	add	%o0, 32, %o0		/* increase dst ptr by 32  */
	ld	[%o1-8], %o4
	sllx	%o4, 32, %o5
	ld	[%o1-4], %o4
	or	%o4, %o5, %o5
	bgu,pt	%XCC, .Lmedw32		/* repeat if at least 32 bytes left  */
	 stx	%o5, [%o0-8]
.Lmedw31:
	addcc	%o2, 31, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
	 cmp	%o2, 16
	blt,pt	%XCC, .Lmedw15
	 nop
	ld	[%o1], %o4		/* move a block of 16 bytes  */
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		/* decrement length count  */
	ld	[%o1+4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	add	%o1, 16, %o1		/* increase src ptr by 16  */
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		/* increase dst ptr by 16  */
	sllx	%o4, 32, %o5
	ld	[%o1-4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0-8]
.Lmedw15:
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
	 cmp	%o2, 8
	blt,pn	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left  */
	 tst	%o2
	ld	[%o1], %o4		/* load 4 bytes  */
	subcc	%o2, 8, %o2		/* decrease count by 8  */
	stw	%o4, [%o0]		/* and store 4 bytes  */
	add	%o1, 8, %o1		/* increase src ptr by 8  */
	ld	[%o1-4], %o3		/* load 4 bytes  */
	add	%o0, 8, %o0		/* increase dst ptr by 8  */
	stw	%o3, [%o0-4]		/* and store 4 bytes  */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
.Lmedw7:				/* count is ge 1, less than 8  */
	 cmp	%o2, 4			/* check for 4 bytes left  */
	blt,pn	%XCC, .Lsmallleft3	/* skip if 3 or fewer bytes left  */
	 nop
	ld	[%o1], %o4		/* load 4 bytes  */
	add	%o1, 4, %o1		/* increase src ptr by 4  */
	add	%o0, 4, %o0		/* increase dst ptr by 4  */
	subcc	%o2, 4, %o2		/* decrease count by 4  */
	bnz,pt	%XCC, .Lsmallleft3
	 stw	%o4, [%o0-4]		/* and store 4 bytes  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Llarge_align8_copy:			/* Src and dst 8 byte aligned  */
	/* align dst to 64 byte boundary  */
	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned  */
	brz,pn	%o3, .Laligned_to_64
	 andcc	%o0, 8, %o3		/* odd long words to move?  */
	brz,pt	%o3, .Laligned_to_16
	 nop
	ldx	[%o1], %o4
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		/* increment src ptr  */
	add	%o0, 8, %o0		/* increment dst ptr  */
	stx	%o4, [%o0-8]
.Laligned_to_16:
	andcc	%o0, 16, %o3		/* pair of long words to move?  */
	brz,pt	%o3, .Laligned_to_32
	 nop
	ldx	[%o1], %o4
	sub	%o2, 16, %o2
	stx	%o4, [%o0]
	add	%o1, 16, %o1		/* increment src ptr  */
	ldx	[%o1-8], %o4
	add	%o0, 16, %o0		/* increment dst ptr  */
	stx	%o4, [%o0-8]
.Laligned_to_32:
	andcc	%o0, 32, %o3		/* four long words to move?  */
	brz,pt	%o3, .Laligned_to_64
	 nop
	ldx	[%o1], %o4
	sub	%o2, 32, %o2
	stx	%o4, [%o0]
	ldx	[%o1+8], %o4
	stx	%o4, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	add	%o1, 32, %o1		/* increment src ptr  */
	ldx	[%o1-8], %o4
	add	%o0, 32, %o0		/* increment dst ptr  */
	stx	%o4, [%o0-8]
.Laligned_to_64:
/*	Following test is included to avoid issues where existing executables
 *	incorrectly call memcpy with overlapping src and dest instead of memmove
 *
 *	if ( (src ge dst) and (dst+len > src)) go to overlap case
 *	if ( (src lt dst) and (src+len > dst)) go to overlap case
 */
	cmp	%o1,%o0
	bge,pt	%XCC, 1f
	 nop
/*				src+len > dst?  */
	add	%o1, %o2, %o4
	cmp	%o4, %o0
	bgt,pt	%XCC, .Lmv_aligned_on_64
	 nop
	ba	2f
	 nop
1:
/*				dst+len > src?  */
	add	%o0, %o2, %o4
	cmp	%o4, %o1
	bgt,pt	%XCC, .Lmv_aligned_on_64
	 nop
2:
/*	handle non-overlapped copies
 *
 *	Using block init store (BIS) instructions to avoid fetching cache
 *	lines from memory. Use ST_CHUNK stores to first element of each cache
 *	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
 *	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
 */
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */

/*	We use ASI_STBIMRU_P for the first store to each cache line,
 *	followed by ASI_STBI_P (mark as LRU) for the last store.  That
 *	mixed approach reduces the chance that the cache line is evicted
 *	before we finish filling it, while minimizing the effect on
 *	other cached values during a large memcpy.
 *
 *	Intermediate stores can be normal since the first BIS activates the
 *	cache line in the L2 cache.
 *
 *	ST_CHUNK batches up the initial BIS operations for several cache
 *	lines so that multiple requests are issued without overflowing the
 *	store miss buffer.  Then the matching stores for all those
 *	BIS operations are executed.
 */
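/*	Schematically, one ST_CHUNK pass of the loops below behaves
 *	roughly like:
 *
 *	    for (i = 0; i < ST_CHUNK; i++)	  // .Lalign_loop_start
 *	        stxa_MRU (dst + i*64, doubleword 0 of src line i);
 *	    for (i = 0; i < ST_CHUNK; i++) {	  // .Lalign_loop_rest
 *	        copy doublewords at offsets 8 .. 48 with plain stx;
 *	        stxa_LRU (dst + i*64 + 56, last doubleword);
 *	    }
 *
 *	where stxa_MRU/stxa_LRU stand for stxa with ASI_STBIMRU_P and
 *	ASI_STBI_P respectively (see STORE_ASI/STORE_INIT above).
 */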

.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%XCC, .Lalign_short
	 mov	ST_CHUNK, %o3
	sllx	%o3, 6, %g5		/* ST_CHUNK*64  */

.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 2, %o3
	ldx	[%o1], %o4
	add	%o1, 128, %o1
	EX_ST(STORE_ASI(%o4, %o0))
	add	%o0, 64, %o0
	ldx	[%o1-64], %o4
	EX_ST(STORE_ASI(%o4, %o0))
	add	%o0, 64, %o0
	bgu,pt	%XCC, .Lalign_loop_start
	 prefetch [%o1 + ((ALIGN_PRE-1) * BLOCK_SIZE)], 21

	mov	ST_CHUNK, %o3
	sub	%o1, %g5, %o1		/* reset %o1  */
	sub	%o0, %g5, %o0		/* reset %o0  */

	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment  */
.Lalign_loop_rest:
	ldx	[%o1+8],%o4
	add	%o0, 64, %o0
	stx	%o4, [%o0-48]
	subcc	%o3, 1, %o3
	ldx	[%o1+16],%o4
	stx	%o4, [%o0-40]
	sub	%o5, 64, %o5
	ldx	[%o1+24],%o4
	stx	%o4, [%o0-32]
	ldx	[%o1+32],%o4
	stx	%o4, [%o0-24]
	ldx	[%o1+40],%o4
	stx	%o4, [%o0-16]
	ldx	[%o1+48],%o4
	stx	%o4, [%o0-8]
	add	%o1, 64, %o1
	ldx	[%o1-8],%o4
	bgu,pt	%XCC, .Lalign_loop_rest
	 EX_ST(STORE_INIT(%o4,%o0))	/* mark cache line as LRU  */

	mov	ST_CHUNK, %o3
	cmp	%o5, ST_CHUNK*64
	bgu,pt	%XCC, .Lalign_loop_start
	 add	%o0, 8, %o0		/* restore %o0 from ASI alignment  */

	cmp	%o5, 0
	beq,pt	%XCC, .Lalign_done

/* no prefetches are needed in these loops
 * since we are within ALIGN_PRE blocks of the end  */
.Lalign_short:
	 srl	%o5, 6, %o3
.Lalign_loop_short:
	subcc	%o3, 1, %o3
	ldx	[%o1], %o4
	add	%o1, 64, %o1
	EX_ST(STORE_ASI(%o4, %o0))
	bgu,pt	%XCC, .Lalign_loop_short
	 add	%o0, 64, %o0

	sub	%o1, %o5, %o1		/* reset %o1  */
	sub	%o0, %o5, %o0		/* reset %o0  */

	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment  */
.Lalign_short_rest:
	ldx	[%o1+8],%o4
	add	%o0, 64, %o0
	stx	%o4, [%o0-48]
	ldx	[%o1+16],%o4
	subcc	%o5, 64, %o5
	stx	%o4, [%o0-40]
	ldx	[%o1+24],%o4
	stx	%o4, [%o0-32]
	ldx	[%o1+32],%o4
	stx	%o4, [%o0-24]
	ldx	[%o1+40],%o4
	stx	%o4, [%o0-16]
	ldx	[%o1+48],%o4
	stx	%o4, [%o0-8]
	add	%o1, 64, %o1
	ldx	[%o1-8],%o4
	bgu,pt	%XCC, .Lalign_short_rest
	 EX_ST(STORE_INIT(%o4,%o0))	/* mark cache line as LRU  */

	add	%o0, 8, %o0		/* restore %o0 from ASI alignment  */

.Lalign_done:
	cmp	%o2, 0
	membar	#StoreStore
	bne,pt	%XCC, .Lmedl63
	 subcc	%o2, 63, %o2		/* adjust length to allow cc test  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
	/* Dst is on 8 byte boundary; src is not; remaining cnt > SMALL_MAX  */
	/* Since block load/store and BIS are not in use for unaligned data,
	 * no need to align dst on 64 byte cache line boundary  */
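	/* The loop below uses the VIS alignaddr/faligndata pair:
	 * alignaddr latches the low three bits of the src address in
	 * %gsr.align, and each faligndata then extracts the 8 destination
	 * bytes that straddle two consecutive aligned doublewords of the
	 * source (held in adjacent FP registers).  */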
.Lunalignsetup:
.Lunalignrejoin:
	rd	%fprs, %g5		/* check for unused fp  */
	/* if fprs.fef == 0, set it.
	 * Setting it when already set costs more than checking */
	andcc	%g5, FPRS_FEF, %g5	/* test FEF, fprs.du = fprs.dl = 0  */
	bz,a	%XCC, 1f
	 wr	%g0, FPRS_FEF, %fprs	/* fprs.fef = 1  */
1:
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
	cmp	%o2, 8			/* Ensure we do not load beyond  */
	bgt,pt	%XCC, .Lunalign_adjust	/* end of source buffer  */
	 andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr  */
	add	%o2, 64, %o2		/* adjust to leave loop  */
	sub	%o5, 64, %o5		/* early if necessary  */
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		/* generate %gsr  */
	add	%o1, %o5, %o1		/* advance %o1 to after blocks  */
	ldd	[%o4], %f0
.Lunalign_loop:
	prefetch [%o0 + (9 * BLOCK_SIZE)], 20
	ldd	[%o4+8], %f2
	faligndata %f0, %f2, %f16
	ldd	[%o4+16], %f4
	subcc	%o5, BLOCK_SIZE, %o5
	std	%f16, [%o0]
	faligndata %f2, %f4, %f18
	ldd	[%o4+24], %f6
	std	%f18, [%o0+8]
	faligndata %f4, %f6, %f20
	ldd	[%o4+32], %f8
	std	%f20, [%o0+16]
	faligndata %f6, %f8, %f22
	ldd	[%o4+40], %f10
	std	%f22, [%o0+24]
	faligndata %f8, %f10, %f24
	ldd	[%o4+48], %f12
	std	%f24, [%o0+32]
	faligndata %f10, %f12, %f26
	ldd	[%o4+56], %f14
	add	%o4, BLOCK_SIZE, %o4
	std	%f26, [%o0+40]
	faligndata %f12, %f14, %f28
	ldd	[%o4], %f0
	std	%f28, [%o0+48]
	faligndata %f14, %f0, %f30
	std	%f30, [%o0+56]
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%XCC, .Lunalign_loop
	 prefetch [%o4 + (11 * BLOCK_SIZE)], 20

	/* Handle trailing bytes, 64 to 127
	 * Dest long word aligned, Src not long word aligned  */
	cmp	%o2, 15
	bleu,pt	%XCC, .Lunalign_short

	 andn	%o2, 0x7, %o5		/* %o5 is multiple of 8  */
	and	%o2, 0x7, %o2		/* residue bytes in %o2  */
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		/* do not load past end of src  */
	andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr  */
	add	%o1, %o5, %o1		/* move %o1 to after multiple of 8  */
	ldd	[%o4], %f0		/* fetch partial word  */
.Lunalign_by8:
	ldd	[%o4+8], %f2
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	std	%f16, [%o0]
	fsrc2	%f2, %f0
	bgu,pt	%XCC, .Lunalign_by8
	 add	%o0, 8, %o0

.Lunalign_short:			/* restore fprs state */
	brnz,pt	%g5, .Lsmallrest
	 nop
	ba	.Lsmallrest
	 wr	%g5, %g0, %fprs
END(__memcpy_niagara7)

#endif
