/* Copy SIZE bytes from SRC to DEST.  For SUN4V M7.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef XCC
# define XCC    xcc
#endif
	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

#define	FPRS_FEF	0x04

/*
 * ASI_STBI_P marks a cache line as "least recently used",
 * which means that if many threads are active the line has a high
 * chance of being pushed out of the cache between the first
 * initializing store and the final stores to that line.
 * Thus, this algorithm uses ASI_STBIMRU_P, which marks the cache
 * line as "most recently used", for the initializing store to each
 * cache line, and marks the line as LRU only with its final store.
 */
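/*
 * Concretely (see .Lalign_loop_rest below), each 64-byte destination
 * line is written as: one stxa with ASI_STBIMRU_P to the first
 * doubleword (which allocates the line in cache without fetching its
 * old contents from memory and keeps it MRU), six ordinary stx stores,
 * and a final stxa with ASI_STBI_P that marks the now fully written
 * line as LRU.
 */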

#define	ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define	ASI_ST_BLK_INIT_MRU_P	0xf2

#define	ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define	ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define	BLOCK_SIZE	64	/* L2 data cache line size  */
#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case  */
				/* must be at least 64  */
#define	SMALL_MAX	255	/* max small copy for word/long aligned  */
#define	SMALL_UMAX	128	/* max small copy for unaligned case  */
#define	MED_WMAX	1023	/* max copy for medium word-aligned case  */
#define	MED_MAX		511	/* max copy for medium longword-aligned case  */
#define	ST_CHUNK	20	/* ST_CHUNK - block of values for BIS Store  */
/* On T4, prefetch fcn 20 is a strong read prefetch into the L1 and L2 data
 * caches; it can stall the instruction pipeline if the data is still in
 * memory.  Prefetch fcn 21 is a strong read prefetch into the L2 data
 * cache only, not the L1 data cache.  */
#define	ALIGN_PRE	20	/* distance for aligned prefetch loop  */

#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text

ENTRY(__memmove_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	cmp	%o1, %o0	/* if src address >= dst, use forward copy  */
	bgeu,pn	%XCC, .Lforcpy	/* forward copy is safe in that case  */
	 sub	%o0, %o1, %o4	/* get difference of the two addresses  */
	cmp	%o2, %o4	/* compare size and difference of addresses  */
	bleu,pn	%XCC, .Lforcpy	/* if size <= difference, no overlap  */
	 add	%o1, %o2, %o5	/* get to end of source space  */

/* an overlapped copy that must be done "backwards"  */
.Lchksize:
	cmp	%o2, 8			/* if fewer than 8 bytes, do byte copy  */
	blu,pn %XCC, 2f			/* else continue  */

/* Now size is at least 8  */
.Ldbalign:
	 add	%o0, %o2, %g1		/* get to end of dest space  */
	andcc	%g1, 7, %o3		/* %o3 has cnt til dst 8 byte align  */
	bz,a,pn	%XCC, .Ldbbck		/* skip if dst is 8 byte aligned  */
	 andn	%o2, 7, %o3		/* force %o3 cnt to multiple of 8  */
	sub	%o2, %o3, %o2		/* update o2 with new count  */

1:	dec	%o5			/* decrement source  */
	ldub	[%o5], %g1		/* load one byte  */
	deccc	%o3			/* decrement count  */
	bgu,pt	%XCC, 1b		/* if not done keep copying  */
	 stb	%g1, [%o5+%o4]		/* store one byte into dest  */
	andncc	%o2, 7, %o3		/* force %o3 cnt to multiple of 8  */
	bz,pn	%XCC, 2f		/* if size < 8, move to byte copy  */

/* Now Destination is 8 byte aligned  */
.Ldbbck:
	 andcc	%o5, 7, %o0		/* %o0 has src offset  */
	bz,a,pn	%XCC, .Ldbcopybc	/* if src is aligned do fast memmove  */
	 sub	%o2, %o3, %o2		/* Residue bytes in %o2  */

.Lcpy_dbwdbc:				/* alignment of src is needed  */
	sub	%o2, 8, %o2		/* set size one loop ahead  */
	sll	%o0, 3, %g1		/* %g1 is left shift  */
	mov	64, %g5			/* init %g5 to be 64  */
	sub	%g5, %g1, %g5		/* %g5 rightshift = (64 - leftshift)  */
	sub	%o5, %o0, %o5		/* align the src at 8 bytes.  */
	add	%o4, %o0, %o4		/* increase diff between src & dst  */
	ldx	[%o5], %o1		/* load first 8 bytes  */
	srlx	%o1, %g5, %o1
1:	sub	%o5, 8, %o5		/* subtract 8 from src  */
	ldx	[%o5], %o0		/* load 8 byte  */
	sllx	%o0, %g1, %o3		/* shift loaded val left to tmp reg  */
	or	%o1, %o3, %o3		/* align data  */
	stx	%o3, [%o5+%o4]		/* store 8 byte  */
	subcc	%o2, 8, %o2		/* subtract 8 byte from size  */
	bg,pt	%XCC, 1b		/* if size > 0 continue  */
	 srlx	%o0, %g5, %o1		/* move extra byte for the next use  */

	srl	%g1, 3, %o0		/* restore %o0 value for alignment  */
	add	%o5, %o0, %o5		/* restore src alignment  */
	sub	%o4, %o0, %o4		/* restore diff between src & dest  */

	ba	2f			/* branch to the trailing byte copy  */
	 add	%o2, 8, %o2		/* restore size value  */

.Ldbcopybc:				/* alignment of src is not needed  */
1:	sub	%o5, 8, %o5		/* subtract from src  */
	ldx	[%o5], %g1		/* load 8 bytes  */
	subcc	%o3, 8, %o3		/* subtract from size  */
	bgu,pt	%XCC, 1b		/* if size is greater than 0, continue  */
	 stx	%g1, [%o5+%o4]		/* store 8 bytes to destination  */

	ba	2f
	 nop

.Lbcbyte:
1:	ldub	[%o5], %g1		/* load one byte  */
	stb	%g1, [%o5+%o4]		/* store one byte  */
2:	deccc	%o2			/* decrement size  */
	bgeu,a,pt %XCC, 1b		/* if size is >= 0 continue  */
	 dec	%o5			/* decrement from address  */

.Lexitbc:				/* exit from backward copy  */
	retl
	 add	%o5, %o4, %o0		/* restore dest addr  */


/* Check to see if memmove is large aligned copy
 * If so, use special version of copy that avoids
 * use of block store init.  */
.Lforcpy:
	cmp	%o2, SMALL_MAX		/* check for not small case  */
	blt,pn	%XCC, .Lmv_short	/* merge with memcpy  */
	 mov	%o0, %g1		/* save %o0  */
	neg	%o0, %o5
	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned  */
	brz,pt	%o5, .Lmv_dst_aligned_on_8

/* %o5 has the bytes to be written in partial store.  */
	 sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
7:					/* dst aligning loop  */
	ldub	[%o1+%o0], %o4		/* load one byte  */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	 add	%o0, 1, %o0		/* advance dst  */
	add	%o1, %o0, %o1		/* restore %o1  */
.Lmv_dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pn	%o5, .Lsrc_dst_unaligned_on_8
	 prefetch [%o1 + (1 * BLOCK_SIZE)], 20

.Lmv_src_dst_aligned_on_8:
/* check if we are copying MED_MAX or more bytes  */
	cmp	%o2, MED_MAX		/* limit to store buffer size  */
	bleu,pt	%XCC, .Lmedlong
	 prefetch [%o1 + (2 * BLOCK_SIZE)], 20

/* The mv_align loop below mimics the memcpy code for large aligned copies,
 * but does not use the ASI_STBI_P (block initializing store) performance
 * optimization.  This path is taken when memcpy is incorrectly invoked
 * with overlapping buffers.  */
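/* (A block-initializing store to the first doubleword of a cache line
 * allocates that line without fetching its previous contents from memory,
 * so with overlapping buffers it could clobber source bytes that have not
 * been copied yet; hence the plain stores here.)  */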

.Lmv_large_align8_copy:			/* Src and dst share 8 byte align  */
					/* align dst to 64 byte boundary  */
	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned  */
	brz,pn	%o3, .Lmv_aligned_on_64
	 sub	%o3, 64, %o3		/* %o3 has negative bytes to move  */
	add	%o2, %o3, %o2		/* adjust remaining count  */
.Lmv_align_to_64:
	ldx	[%o1], %o4
	add	%o1, 8, %o1		/* increment src ptr  */
	addcc	%o3, 8, %o3
	stx	%o4, [%o0]
	brnz,pt	%o3, .Lmv_align_to_64
	 add	%o0, 8, %o0		/* increment dst ptr  */

.Lmv_aligned_on_64:
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
.Lmv_align_loop:
	ldx	[%o1],%o4
	stx	%o4,[%o0]
	prefetch [%o0 + (10 * BLOCK_SIZE)], 22
	prefetch [%o1 + (10 * BLOCK_SIZE)], 21
	subcc	%o5, 64, %o5
	ldx	[%o1+8],%o4
	stx	%o4,[%o0+8]
	ldx	[%o1+16],%o4
	stx	%o4,[%o0+16]
	ldx	[%o1+24],%o4
	stx	%o4,[%o0+24]
	ldx	[%o1+32],%o4
	stx	%o4,[%o0+32]
	ldx	[%o1+40],%o4
	stx	%o4,[%o0+40]
	ldx	[%o1+48],%o4
	add	%o1, 64, %o1
	stx	%o4,[%o0+48]
	add	%o0, 64, %o0
	ldx	[%o1-8],%o4
	bgt,pt	%XCC, .Lmv_align_loop
	 stx	%o4,[%o0-8]

	ba	.Lmedlong
	 nop
END(__memmove_niagara7)

ENTRY(__mempcpy_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	ba,pt	%icc, 101f
	 add	%o0, %o2, %g1		/* save dst + len  */
END(__mempcpy_niagara7)

	.align	32
ENTRY(__memcpy_niagara7)
100:	/* %o0=dst, %o1=src, %o2=len */
	mov	%o0, %g1		/* save %o0  */
101:
#ifndef __arch64__
	srl	%o2, 0, %o2
#endif
	cmp	%o2, SMALL_MAX		/* check for not small case  */
	bgeu,pn	%XCC, .Lmedium		/* go to larger cases  */
.Lmv_short:
	 cmp	%o2, SHORTCOPY		/* check for really short case  */
	ble,pn	%XCC, .Lsmallfin
	 or	%o0, %o1, %o4		/* prepare alignment check  */
	andcc	%o4, 0x3, %o5		/* test for word alignment  */
	bnz,pn	%XCC, .Lsmallunalign	/* branch to non-word aligned case  */
	 nop
	subcc	%o2, 7, %o2		/* adjust count  */
	ble,pn	%XCC, .Lsmallwordx
	 andcc	%o4, 0x7, %o5		/* test for long alignment  */
/* 8 or more bytes, src and dest start on word boundary
 * %o4 contains or %o0, %o1  */
.Lsmalllong:
	bnz,pn	%XCC, .Lsmallwords	/* branch to word aligned case  */
	 cmp	%o2, SHORT_LONG-7
	bge,a	%XCC, .Lmedl64		/* if we branch  */
	 sub	%o2,56,%o2		/* adjust %o2 to -63 off count  */

/* slightly unroll the small_long_loop to improve very short copies  */
	cmp	%o2, 32-7
	blt,a,pn %XCC, .Lsmall_long_l
	 sub	%o1, %o0, %o1		/* %o1 gets the difference  */

	ldx	[%o1], %o5
	ldx	[%o1+8], %o4
	ldx	[%o1+16], %o3

	subcc	%o2, 24, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

	stx	%o5, [%o0]		/* write word  */
	stx	%o4, [%o0+8]		/* write word  */
	stx	%o3, [%o0+16]		/* write word  */

	add	%o0, 24, %o0

/* end loop unroll  */

.Lsmall_long_l:
	ldx	[%o1+%o0], %o3
	subcc	%o2, 8, %o2
	add	%o0, 8, %o0
	bgu,pn	%XCC, .Lsmall_long_l	/* loop until done  */
	 stx	%o3, [%o0-8]		/* write word  */
	addcc	%o2, 7, %o2		/* restore %o2 to correct count  */
	bnz,pn	%XCC, .Lsmall_long_x	/* check for completion  */
	 add	%o1, %o0, %o1		/* restore %o1  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
.Lsmall_long_x:
	cmp	%o2, 4			/* check for 4 or more bytes left  */
	blt,pn	%XCC, .Lsmallleft3	/* if not, go to finish up  */
	 nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	bnz,pn	%XCC, .Lsmallleft3
	 add	%o0, 4, %o0
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 32
/* src and dest start on word boundary; 7 or fewer bytes  */
.Lsmallwordx:
	lduw	[%o1], %o3		/* read word  */
	addcc	%o2, 3, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit
	 stw	%o3, [%o0]		/* write word  */
	deccc	%o2			/* reduce count for cc test  */
	ldub	[%o1+4], %o3		/* load one byte  */
	bz,pt	%XCC, .Lsmallexit
	 stb	%o3, [%o0+4]		/* store one byte  */
	ldub	[%o1+5], %o3		/* load second byte  */
	deccc	%o2
	bz,pt	%XCC, .Lsmallexit
	 stb	%o3, [%o0+5]		/* store second byte  */
	ldub	[%o1+6], %o3		/* load third byte  */
	stb	%o3, [%o0+6]		/* store third byte  */
.Lsmallexit:
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 32
.Lsmallunalign:
	cmp	%o2, SHORTCHECK
	ble,pn	%XCC, .Lsmallrest
	 cmp	%o2, SMALL_UMAX
	bge,pt	%XCC, .Lmedium_join
	 andcc	%o1, 0x3, %o5		/* is src word aligned  */
	bz,pn	%XCC, .Laldst
	 cmp	%o5, 2			/* is src half-word aligned  */
	be,pt	%XCC, .Ls2algn
	 cmp	%o5, 3			/* src is byte aligned  */
.Ls1algn:
	ldub	[%o1], %o3		/* move 1 or 3 bytes to align it  */
	inc	1, %o1
	stb	%o3, [%o0]		/* move a byte to align src  */
	inc	1, %o0
	bne,pt	%XCC, .Ls2algn
	 dec	%o2
	b	.Lald			/* now go align dest  */
	 andcc	%o0, 0x3, %o5

.Ls2algn:
	lduh	[%o1], %o3		/* know src is 2 byte aligned  */
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%o0]		/* have to do bytes,  */
	stb	%o3, [%o0 + 1]		/* do not know dst alignment  */
	inc	2, %o0
	dec	2, %o2

.Laldst:
	andcc	%o0, 0x3, %o5		/* align the destination address  */
.Lald:
	bz,pn	%XCC, .Lw4cp
	 cmp	%o5, 2
	be,pn	%XCC, .Lw2cp
	 cmp	%o5, 3
.Lw3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%o0]
	bne,pt	%XCC, .Lw1cp
	 inc	%o0
	dec	1, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	dec	4, %o3			/* avoid reading beyond tail of src  */
	sub	%o1, %o0, %o1		/*  %o1 gets the difference  */

1:	sll	%o4, 8, %g5		/* save residual bytes  */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		/* merge with residual  */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 1b
	 inc	4, %o0
	sub	%o1, 3, %o1		/* used one byte of last word read  */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw1cp:	srl	%o4, 8, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	3, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	dec	4, %o3			/* avoid reading beyond tail of src  */
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

2:	sll	%o4, 24, %g5		/* save residual bytes  */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		/* merge with residual  */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 2b
	 inc	4, %o0
	sub	%o1, 1, %o1		/* used 3 bytes of last word read  */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	2, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	dec	4, %o3			/* avoid reading beyond tail of src  */
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

3:	sll	%o4, 16, %g5		/* save residual bytes  */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		/* merge with residual  */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 3b
	 inc	4, %o0
	sub	%o1, 2, %o1		/* used two bytes of last word read  */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw4cp:	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

1:	lduw	[%o1+%o0], %o4		/* read from address  */
	deccc	4, %o3			/* decrement count  */
	st	%o4, [%o0]		/* write at destination address  */
	bgu,pt	%XCC, 1b
	 inc	4, %o0			/* increment to address  */
	and	%o2, 3, %o2		/* number of leftover bytes, if any  */

	/* simple finish up byte copy, works with any alignment  */
7:
	add	%o1, %o0, %o1		/* restore %o1  */
.Lsmallrest:
	tst	%o2
	bz,pt	%XCC, .Lsmallx
	 cmp	%o2, 4
	blt,pn	%XCC, .Lsmallleft3
	 nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	ldub	[%o1], %o3		/* read byte  */
	subcc	%o2, 4, %o2		/* reduce count by 4  */
	stb	%o3, [%o0]		/* write byte  */
	ldub	[%o1+1], %o3		/* repeat for total of 4 bytes  */
	add	%o1, 4, %o1		/* advance SRC by 4  */
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		/* advance DST by 4  */
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%XCC, .Lsmallnotalign4	/* loop until 3 or fewer bytes remain  */
	 stb	%o3, [%o0-1]
	addcc	%o2, 3, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallx
.Lsmallleft3:				/* 1, 2, or 3 bytes remain  */
	 subcc	%o2, 1, %o2
	ldub	[%o1], %o3		/* load one byte  */
	bz,pt	%XCC, .Lsmallx
	 stb	%o3, [%o0]		/* store one byte  */
	ldub	[%o1+1], %o3		/* load second byte  */
	subcc	%o2, 1, %o2
	bz,pt	%XCC, .Lsmallx
	 stb	%o3, [%o0+1]		/* store second byte  */
	ldub	[%o1+2], %o3		/* load third byte  */
	stb	%o3, [%o0+2]		/* store third byte  */
.Lsmallx:
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

.Lsmallfin:
	tst	%o2
	bnz,pn	%XCC, .Lsmallleft3
	 nop
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Lsmallwords:
	lduw	[%o1], %o3		/* read word  */
	subcc	%o2, 8, %o2		/* update count  */
	stw	%o3, [%o0]		/* write word  */
	add	%o1, 8, %o1		/* update SRC  */
	lduw	[%o1-4], %o3		/* read word  */
	add	%o0, 8, %o0		/* update DST  */
	bgu,pt	%XCC, .Lsmallwords	/* loop until done  */
	 stw	%o3, [%o0-4]		/* write word  */
	addcc	%o2, 7, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit	/* check for completion  */
	 cmp	%o2, 4			/* check for 4 or more bytes left  */
	blt,pt	%XCC, .Lsmallleft3	/* if not, go to finish up  */
	 nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	bnz,pn	%XCC, .Lsmallleft3
	 stw	%o3, [%o0-4]
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Lmedium:
.Lmedium_join:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned  */
	brz,pt	%o5, .Ldst_aligned_on_8

	/* %o5 has the bytes to be written in partial store.  */
	 sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
7:					/* dst aligning loop  */
	ldub	[%o1+%o0], %o4		/* load one byte  */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	 add	%o0, 1, %o0		/* advance dst  */
	add	%o1, %o0, %o1		/* restore %o1  */
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	 nop

.Lsrc_dst_aligned_on_8:
	/* check if we are copying more than MED_MAX bytes  */
	cmp	%o2, MED_MAX		/* limit to store buffer size  */
	bgu,pn	%XCC, .Llarge_align8_copy
	 nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is MED_MAX bytes or less
 */
.Lmedlong:
	subcc	%o2, 63, %o2		/* adjust length to allow cc test  */
	ble,pn	%XCC, .Lmedl63		/* skip big loop if < 64 bytes  */
	 nop
.Lmedl64:
	ldx	[%o1], %o4		/* load  */
	subcc	%o2, 64, %o2		/* decrement length count  */
	stx	%o4, [%o0]		/* and store  */
	ldx	[%o1+8], %o3		/* a block of 64 bytes  */
	stx	%o3, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o3
	stx	%o3, [%o0+24]
	ldx	[%o1+32], %o4		/* load  */
	stx	%o4, [%o0+32]		/* and store  */
	ldx	[%o1+40], %o3		/* a block of 64 bytes  */
	add	%o1, 64, %o1		/* increase src ptr by 64  */
	stx	%o3, [%o0+40]
	ldx	[%o1-16], %o4
	add	%o0, 64, %o0		/* increase dst ptr by 64  */
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%XCC, .Lmedl64		/* repeat if at least 64 bytes left  */
	 stx	%o3, [%o0-8]
.Lmedl63:
	addcc	%o2, 32, %o2		/* adjust remaining count  */
	ble,pt	%XCC, .Lmedl31		/* to skip if 31 or fewer bytes left  */
	 nop
	ldx	[%o1], %o4		/* load  */
	sub	%o2, 32, %o2		/* decrement length count  */
	stx	%o4, [%o0]		/* and store  */
	ldx	[%o1+8], %o3		/* a block of 32 bytes  */
	add	%o1, 32, %o1		/* increase src ptr by 32  */
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		/* increase dst ptr by 32  */
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	stx	%o3, [%o0-8]
.Lmedl31:
	addcc	%o2, 16, %o2		/* adjust remaining count  */
	ble,pt	%XCC, .Lmedl15		/* skip if 15 or fewer bytes left  */
	 nop
	ldx	[%o1], %o4		/* load and store 16 bytes  */
	add	%o1, 16, %o1		/* increase src ptr by 16  */
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		/* decrease count by 16  */
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		/* increase dst ptr by 16  */
	stx	%o3, [%o0-8]
.Lmedl15:
	addcc	%o2, 15, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
	 cmp	%o2, 8
	blt,pt	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left  */
	 tst	%o2
	ldx	[%o1], %o4		/* load 8 bytes  */
	add	%o1, 8, %o1		/* increase src ptr by 8  */
	add	%o0, 8, %o0		/* increase dst ptr by 8  */
	subcc	%o2, 8, %o2		/* decrease count by 8  */
	bnz,pn	%XCC, .Lmedw7
	 stx	%o4, [%o0-8]		/* and store 8 bytes  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Lsrc_dst_unaligned_on_8:
	/* DST is 8-byte aligned, src is not  */
	andcc	%o1, 0x3, %o5		/* test word alignment  */
	bnz,pt	%XCC, .Lunalignsetup	/* branch if not word aligned  */
	 nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries.  Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache, for short to medium
 * sized data moves.
 */
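/*
 * The loops below merge pairs of 32-bit loads into one 64-bit store:
 * since SPARC is big-endian, the earlier word becomes the high half,
 * roughly  dst64 = ((uint64_t) src32[0] << 32) | src32[1].
 */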
	cmp %o2, MED_WMAX		/* limit to store buffer size  */
	bge,pt	%XCC, .Lunalignrejoin	/* if at least MED_WMAX, rejoin the  */
	 nop				/* general unaligned copy path  */

	subcc	%o2, 31, %o2		/* adjust length to allow cc test  */
					/* for end of loop  */
	ble,pt	%XCC, .Lmedw31		/* skip big loop if fewer than 32 bytes  */
.Lmedw32:
	 ld	[%o1], %o4		/* move a block of 32 bytes  */
	sllx	%o4, 32, %o5
	ld	[%o1+4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	subcc	%o2, 32, %o2		/* decrement length count  */
	ld	[%o1+8], %o4
	sllx	%o4, 32, %o5
	ld	[%o1+12], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+8]
	add	%o1, 32, %o1		/* increase src ptr by 32  */
	ld	[%o1-16], %o4
	sllx	%o4, 32, %o5
	ld	[%o1-12], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+16]
	add	%o0, 32, %o0		/* increase dst ptr by 32  */
	ld	[%o1-8], %o4
	sllx	%o4, 32, %o5
	ld	[%o1-4], %o4
	or	%o4, %o5, %o5
	bgu,pt	%XCC, .Lmedw32		/* repeat if at least 32 bytes left  */
	 stx	%o5, [%o0-8]
.Lmedw31:
	addcc	%o2, 31, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
	 cmp	%o2, 16
	blt,pt	%XCC, .Lmedw15
	 nop
	ld	[%o1], %o4		/* move a block of 16 bytes  */
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		/* decrement length count  */
	ld	[%o1+4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	add	%o1, 16, %o1		/* increase src ptr by 16  */
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		/* increase dst ptr by 16  */
	sllx	%o4, 32, %o5
	ld	[%o1-4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0-8]
.Lmedw15:
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
	 cmp	%o2, 8
	blt,pn	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left  */
	 tst	%o2
	ld	[%o1], %o4		/* load 4 bytes  */
	subcc	%o2, 8, %o2		/* decrease count by 8  */
	stw	%o4, [%o0]		/* and store 4 bytes  */
	add	%o1, 8, %o1		/* increase src ptr by 8  */
	ld	[%o1-4], %o3		/* load 4 bytes  */
	add	%o0, 8, %o0		/* increase dst ptr by 8  */
	stw	%o3, [%o0-4]		/* and store 4 bytes  */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
.Lmedw7:				/* count is ge 1, less than 8  */
	 cmp	%o2, 4			/* check for 4 bytes left  */
	blt,pn	%XCC, .Lsmallleft3	/* skip if 3 or fewer bytes left  */
	 nop
	ld	[%o1], %o4		/* load 4 bytes  */
	add	%o1, 4, %o1		/* increase src ptr by 4  */
	add	%o0, 4, %o0		/* increase dst ptr by 4  */
	subcc	%o2, 4, %o2		/* decrease count by 4  */
	bnz,pt	%XCC, .Lsmallleft3
	 stw	%o4, [%o0-4]		/* and store 4 bytes  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Llarge_align8_copy:			/* Src and dst 8 byte aligned  */
	/* align dst to 64 byte boundary  */
	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned  */
	brz,pn	%o3, .Laligned_to_64
	 andcc	%o0, 8, %o3		/* odd long words to move?  */
	brz,pt	%o3, .Laligned_to_16
	 nop
	ldx	[%o1], %o4
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		/* increment src ptr  */
	add	%o0, 8, %o0		/* increment dst ptr  */
	stx	%o4, [%o0-8]
.Laligned_to_16:
	andcc	%o0, 16, %o3		/* pair of long words to move?  */
	brz,pt	%o3, .Laligned_to_32
	 nop
	ldx	[%o1], %o4
	sub	%o2, 16, %o2
	stx	%o4, [%o0]
	add	%o1, 16, %o1		/* increment src ptr  */
	ldx	[%o1-8], %o4
	add	%o0, 16, %o0		/* increment dst ptr  */
	stx	%o4, [%o0-8]
.Laligned_to_32:
	andcc	%o0, 32, %o3		/* four long words to move?  */
	brz,pt	%o3, .Laligned_to_64
	 nop
	ldx	[%o1], %o4
	sub	%o2, 32, %o2
	stx	%o4, [%o0]
	ldx	[%o1+8], %o4
	stx	%o4, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	add	%o1, 32, %o1		/* increment src ptr  */
	ldx	[%o1-8], %o4
	add	%o0, 32, %o0		/* increment dst ptr  */
	stx	%o4, [%o0-8]
.Laligned_to_64:
/*	Following test is included to avoid issues where existing executables
 *	incorrectly call memcpy with overlapping src and dest instead of memmove
 *
 *	if ( (src ge dst) and (dst+len > src)) go to overlap case
 *	if ( (src lt dst) and (src+len > dst)) go to overlap case
 */
	cmp	%o1,%o0
	bge,pt	%XCC, 1f
	 nop
/*				src+len > dst?  */
	add	%o1, %o2, %o4
	cmp	%o4, %o0
	bgt,pt	%XCC, .Lmv_aligned_on_64
	 nop
	ba	2f
	 nop
1:
/*				dst+len > src?  */
	add	%o0, %o2, %o4
	cmp	%o4, %o1
	bgt,pt	%XCC, .Lmv_aligned_on_64
	 nop
2:
/*	handle non-overlapped copies
 *
 *	Using block init store (BIS) instructions to avoid fetching cache
 *	lines from memory. Use ST_CHUNK stores to first element of each cache
 *	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
 *	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
 */
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */

/*	We use ASI_STBIMRU_P for the first store to each cache line,
 *	followed by ASI_STBI_P (mark as LRU) for the last store.  That
 *	mixed approach reduces the chance that the cache line is evicted
 *	before we finish filling it, while minimizing the effect on
 *	other cached values during a large memcpy.
 *
 *	Intermediate stores can be normal since the first BIS activates the
 *	cache line in the L2 cache.
 *
 *	ST_CHUNK batches up the initial BIS operations for several cache
 *	lines so that multiple requests are issued without overflowing the
 *	store miss buffer.  Then the matching stores for all those
 *	BIS operations are executed.
 */
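/*	Schematically, one ST_CHUNK pass of the loops below behaves
 *	roughly like:
 *
 *	    for (i = 0; i < ST_CHUNK; i++)	  // .Lalign_loop_start
 *	        stxa_MRU (dst + i*64, doubleword 0 of src line i);
 *	    for (i = 0; i < ST_CHUNK; i++) {	  // .Lalign_loop_rest
 *	        copy doublewords at offsets 8 .. 48 with plain stx;
 *	        stxa_LRU (dst + i*64 + 56, last doubleword);
 *	    }
 *
 *	where stxa_MRU/stxa_LRU stand for stxa with ASI_STBIMRU_P and
 *	ASI_STBI_P respectively (see STORE_ASI/STORE_INIT above).
 */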

.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%XCC, .Lalign_short
	 mov	ST_CHUNK, %o3
	sllx	%o3, 6, %g5		/* ST_CHUNK*64  */

.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 2, %o3
	ldx	[%o1], %o4
	add	%o1, 128, %o1
	EX_ST(STORE_ASI(%o4, %o0))
	add	%o0, 64, %o0
	ldx	[%o1-64], %o4
	EX_ST(STORE_ASI(%o4, %o0))
	add	%o0, 64, %o0
	bgu,pt	%XCC, .Lalign_loop_start
	 prefetch [%o1 + ((ALIGN_PRE-1) * BLOCK_SIZE)], 21

	mov	ST_CHUNK, %o3
	sub	%o1, %g5, %o1		/* reset %o1  */
	sub	%o0, %g5, %o0		/* reset %o0  */

	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment  */
.Lalign_loop_rest:
	ldx	[%o1+8],%o4
	add	%o0, 64, %o0
	stx	%o4, [%o0-48]
	subcc	%o3, 1, %o3
	ldx	[%o1+16],%o4
	stx	%o4, [%o0-40]
	sub	%o5, 64, %o5
	ldx	[%o1+24],%o4
	stx	%o4, [%o0-32]
	ldx	[%o1+32],%o4
	stx	%o4, [%o0-24]
	ldx	[%o1+40],%o4
	stx	%o4, [%o0-16]
	ldx	[%o1+48],%o4
	stx	%o4, [%o0-8]
	add	%o1, 64, %o1
	ldx	[%o1-8],%o4
	bgu,pt	%XCC, .Lalign_loop_rest
	 EX_ST(STORE_INIT(%o4,%o0))	/* mark cache line as LRU  */

	mov	ST_CHUNK, %o3
	cmp	%o5, ST_CHUNK*64
	bgu,pt	%XCC, .Lalign_loop_start
	 add	%o0, 8, %o0		/* restore %o0 from ASI alignment  */

	cmp	%o5, 0
	beq,pt	%XCC, .Lalign_done

/* no prefetches are needed in these loops
 * since we are within ALIGN_PRE blocks of the end  */
.Lalign_short:
	 srl	%o5, 6, %o3
.Lalign_loop_short:
	subcc	%o3, 1, %o3
	ldx	[%o1], %o4
	add	%o1, 64, %o1
	EX_ST(STORE_ASI(%o4, %o0))
	bgu,pt	%XCC, .Lalign_loop_short
	 add	%o0, 64, %o0

	sub	%o1, %o5, %o1		/* reset %o1  */
	sub	%o0, %o5, %o0		/* reset %o0  */

	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment  */
.Lalign_short_rest:
	ldx	[%o1+8],%o4
	add	%o0, 64, %o0
	stx	%o4, [%o0-48]
	ldx	[%o1+16],%o4
	subcc	%o5, 64, %o5
	stx	%o4, [%o0-40]
	ldx	[%o1+24],%o4
	stx	%o4, [%o0-32]
	ldx	[%o1+32],%o4
	stx	%o4, [%o0-24]
	ldx	[%o1+40],%o4
	stx	%o4, [%o0-16]
	ldx	[%o1+48],%o4
	stx	%o4, [%o0-8]
	add	%o1, 64, %o1
	ldx	[%o1-8],%o4
	bgu,pt	%XCC, .Lalign_short_rest
	 EX_ST(STORE_INIT(%o4,%o0))	/* mark cache line as LRU  */

	add	%o0, 8, %o0		/* restore %o0 from ASI alignment  */

.Lalign_done:
	cmp	%o2, 0
	membar	#StoreStore
	bne,pt	%XCC, .Lmedl63
	 subcc	%o2, 63, %o2		/* adjust length to allow cc test  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
	/* Dst is on 8 byte boundary; src is not; remaining cnt > SMALL_MAX  */
	/* Since block load/store and BIS are not in use for unaligned data,
	 * no need to align dst on 64 byte cache line boundary  */
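	/* The loop below uses the VIS alignaddr/faligndata pair:
	 * alignaddr latches the low three bits of the src address in
	 * %gsr.align, and each faligndata then extracts the 8 destination
	 * bytes that straddle two consecutive aligned doublewords of the
	 * source (held in adjacent FP registers).  */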
.Lunalignsetup:
.Lunalignrejoin:
	rd	%fprs, %g5		/* check for unused fp  */
	/* if fprs.fef == 0, set it.
	 * Setting it when already set costs more than checking */
	andcc	%g5, FPRS_FEF, %g5	/* test FEF, fprs.du = fprs.dl = 0  */
	bz,a	%XCC, 1f
	 wr	%g0, FPRS_FEF, %fprs	/* fprs.fef = 1  */
1:
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
	cmp	%o2, 8			/* Ensure we do not load beyond  */
	bgt,pt	%XCC, .Lunalign_adjust	/* end of source buffer  */
	 andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr  */
	add	%o2, 64, %o2		/* adjust to leave loop  */
	sub	%o5, 64, %o5		/* early if necessary  */
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		/* generate %gsr  */
	add	%o1, %o5, %o1		/* advance %o1 to after blocks  */
	ldd	[%o4], %f0
.Lunalign_loop:
	prefetch [%o0 + (9 * BLOCK_SIZE)], 20
	ldd	[%o4+8], %f2
	faligndata %f0, %f2, %f16
	ldd	[%o4+16], %f4
	subcc	%o5, BLOCK_SIZE, %o5
	std	%f16, [%o0]
	faligndata %f2, %f4, %f18
	ldd	[%o4+24], %f6
	std	%f18, [%o0+8]
	faligndata %f4, %f6, %f20
	ldd	[%o4+32], %f8
	std	%f20, [%o0+16]
	faligndata %f6, %f8, %f22
	ldd	[%o4+40], %f10
	std	%f22, [%o0+24]
	faligndata %f8, %f10, %f24
	ldd	[%o4+48], %f12
	std	%f24, [%o0+32]
	faligndata %f10, %f12, %f26
	ldd	[%o4+56], %f14
	add	%o4, BLOCK_SIZE, %o4
	std	%f26, [%o0+40]
	faligndata %f12, %f14, %f28
	ldd	[%o4], %f0
	std	%f28, [%o0+48]
	faligndata %f14, %f0, %f30
	std	%f30, [%o0+56]
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%XCC, .Lunalign_loop
	 prefetch [%o4 + (11 * BLOCK_SIZE)], 20

	/* Handle trailing bytes, 64 to 127
	 * Dest long word aligned, Src not long word aligned  */
	cmp	%o2, 15
	bleu,pt	%XCC, .Lunalign_short

	 andn	%o2, 0x7, %o5		/* %o5 is multiple of 8  */
	and	%o2, 0x7, %o2		/* residue bytes in %o2  */
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		/* do not load past end of src  */
	andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr  */
	add	%o1, %o5, %o1		/* move %o1 to after multiple of 8  */
	ldd	[%o4], %f0		/* fetch partial word  */
.Lunalign_by8:
	ldd	[%o4+8], %f2
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	std	%f16, [%o0]
	fsrc2	%f2, %f0
	bgu,pt	%XCC, .Lunalign_by8
	 add	%o0, 8, %o0

.Lunalign_short:			/* restore fprs state */
	brnz,pt	%g5, .Lsmallrest
	 nop
	ba	.Lsmallrest
	 wr	%g5, %g0, %fprs
END(__memcpy_niagara7)

#endif
