1/* Optimized version of the standard memmove() function.
2   This file is part of the GNU C Library.
3   Copyright (C) 2000-2022 Free Software Foundation, Inc.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19/* Return: dest
20
21   Inputs:
22        in0:    dest
23        in1:    src
24        in2:    byte count
25
26   The core of the function is the memcpy implementation used in memcpy.S.
27   When bytes have to be copied backwards, only the easy case, when
28   all arguments are multiples of 8, is optimised.
29
30   In this form, it assumes little endian mode.  For big endian mode,
31   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
32   or the UM.be bit should be cleared at the beginning and set at the end.  */
33
34#include <sysdep.h>
/* Drop any `ret` macro that is still defined after including sysdep.h
   (presumably so that the `br.ret` mnemonics below are not macro-expanded
   by the preprocessor -- confirm against sysdep.h).  */
35#undef ret

/* OP_T_THRES: on the forward path that misses the fully aligned fast
   case, copies of at most this many bytes go straight to the byte loop
   at .cpyfew (see .next).
   OPSIZ: word size in bytes.  It is only mentioned in comments in this
   file; the code uses the literals 8 and -8 directly.  */
37#define OP_T_THRES 	16
38#define OPSIZ 		 8

/* Symbolic names for the static general registers used by memmove:

     adest     second destination pointer, 8 bytes ahead of dest (.l0)
     saved_pr  copy of the predicate registers taken on entry
     saved_lc  copy of ar.lc taken on entry
     dest      running destination pointer (starts as in0)
     src       running source pointer (starts as in1)
     len       remaining byte count (starts as in2)
     asrc      second source pointer in .l0; 8-byte aligned source
               pointer in the LOOP(shift) kernels
     tmp2      scratch: -dest, later len & -8
     tmp3      scratch: dest | src | len, src + len, linkage-table slot
     tmp4      scratch: (dest | src | len) & 7, later a .table entry
     ptable    address of .table
     ploop56   address of .loop56
     loopaddr  address of the selected .loopN kernel
     sh1       src % 8, then 8 * (src % 8)
     loopcnt   iteration count minus one, destined for ar.lc
     value     data word or byte in flight outside the rotating arrays  */
40#define adest		r15
41#define saved_pr	r17
42#define saved_lc	r18
43#define dest		r19
44#define src		r20
45#define len		r21
46#define asrc		r22
47#define tmp2		r23
48#define tmp3		r24
49#define	tmp4		r25
50#define ptable		r26
51#define ploop56		r27
52#define	loopaddr	r28
53#define	sh1		r29
54#define loopcnt		r30
55#define	value		r31
56
/* ALIGN(n) is placed in front of each LOOP(shift) kernel.  Normally it
   expands to `.align n`.  When GAS_ALIGN_BREAKS_UNWIND_INFO is defined
   (not defined in this file) it expands to a single `{ nop 0 }` bundle
   instead, so no alignment directive is emitted.  */
57#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
58# define ALIGN(n)	{ nop 0 }
59#else
60# define ALIGN(n)	.align n
61#endif
62
/* LOOP(shift) emits one software-pipelined kernel, labelled .loop<shift>,
   for the forward copy where dest is 8-byte aligned and src is not.
   shift is the source misalignment in bits (8 * (src % 8)).

   Pipeline stages, selected by the rotating predicates:
     p[0]         load the next aligned 8-byte word from asrc into r[0]
                  and advance asrc by 8;
     p[MEMLAT]    shrp combines r[MEMLAT] and r[MEMLAT+1], the two words
                  loaded MEMLAT and MEMLAT+1 iterations earlier, shifted
                  right by `shift` bits, into value;
     p[MEMLAT+1]  store value to dest and advance dest by 8.

   br.ctop repeats the kernel under control of ar.lc and ar.ec, which the
   caller sets up before jumping here (with r[1] preloaded with the first
   aligned word).  When the loop is done, control goes to .cpyfew to copy
   the remaining len bytes.  MEMLAT is defined further down in the file;
   that is fine because the macro is only expanded after that point.  */
63#define LOOP(shift)							\
64		ALIGN(32);						\
65.loop##shift##:								\
66(p[0])		ld8	r[0] = [asrc], 8 ;	/* w1 */		\
67(p[MEMLAT+1])	st8	[dest] = value, 8 ;				\
68(p[MEMLAT])	shrp	value = r[MEMLAT], r[MEMLAT+1], shift ;		\
69		nop.b	0 ;						\
70		nop.b	0 ;						\
71		br.ctop.sptk .loop##shift ;				\
72		br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
73
/* MEMLAT is the distance, in loop iterations, between the load stage
   p[0] and the consuming stage p[MEMLAT] of every pipelined loop below.
   Nrot is the size of the rotating register region requested by alloc:
   2*MEMLAT+3 registers are needed for the r[MEMLAT+2] and q[MEMLAT+1]
   arrays, rounded up to a multiple of 8 (48 for MEMLAT == 21).  */
74#define MEMLAT	21
75#define Nrot	(((2*MEMLAT+3) + 7) & ~7)
76
// memmove: in0 = dest, in1 = src, in2 = byte count; returns dest in ret0.
//
// Dispatch performed right after the prologue:
//   len == 0                          -> .restore_and_exit
//   dest <= src                       -> .forward
//   dest > src and dest < src + len   -> .backward
//   otherwise (no overlap)            -> falls through into .forward
//
// .forward uses the two-stream loop .l0 (16 bytes per iteration) when
// (dest | src | len) & 7 == 0, and the general path at .next otherwise.
// .backward uses the descending word loop .l5 under the same condition
// and the descending byte loop .l6 otherwise.
//
// pr and ar.lc are saved on entry and restored on every exit path.
77ENTRY(memmove)
78	.prologue
// Register frame: 3 inputs, Nrot - 3 locals, 0 outputs, Nrot rotating
// registers.  r[] and q[] name the rotating general registers and p[] the
// rotating predicates used as pipeline stage enables.
79	alloc 	r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
80	.rotr	r[MEMLAT + 2], q[MEMLAT + 1]
81	.rotp	p[MEMLAT + 2]
82	mov	ret0 = in0		// return value = dest
83	.save pr, saved_pr
84	mov	saved_pr = pr		// save the predicate registers
85	.save ar.lc, saved_lc
86        mov 	saved_lc = ar.lc	// save the loop counter
87	.body
88	or	tmp3 = in0, in1 ;;	// tmp3 = dest | src
89	or	tmp3 = tmp3, in2	// tmp3 = dest | src | len
90	mov 	dest = in0		// dest
91	mov 	src = in1		// src
92	mov	len = in2		// len
93	sub	tmp2 = r0, in0		// tmp2 = -dest
94	cmp.eq	p6, p0 = in2, r0	// if (len == 0)
95(p6)	br.cond.spnt .restore_and_exit;;// 	return dest;
// tmp4 keeps the combined low three bits; it is tested again at .forward
// and at .backward to choose between the word and the byte variants.
96	and	tmp4 = 7, tmp3 		// tmp4 = (dest | src | len) & 7
// NOTE(review): cmp.le and cmp.lt are the signed compare forms; confirm
// that signed comparison of the two addresses is what is intended here.
97	cmp.le	p6, p0 = dest, src	// if dest <= src it's always safe
98(p6)	br.cond.spnt .forward		// to copy forward
99	add	tmp3 = src, len;;
100	cmp.lt	p6, p0 = dest, tmp3	// if dest > src && dest < src + len
101(p6)	br.cond.spnt .backward		// we have to copy backward
102
// Forward copy.  Also reached by fall-through when the blocks do not
// overlap.
103.forward:
104	shr.u	loopcnt = len, 4 ;;	// loopcnt = len / 16
105	cmp.ne	p6, p0 = tmp4, r0	// if ((dest | src | len) & 7 != 0)
106(p6)	br.cond.sptk .next		//	goto next;
107
108// The optimal case, when dest, src and len are all multiples of 8
// If len is not a multiple of 16, one 8-byte word is copied up front so
// that the rest can be moved 16 bytes per iteration by .l0.  len == 8 is
// finished after that single word and exits before the loop.
109
110	and	tmp3 = 0xf, len
111	mov	pr.rot = 1 << 16	// set rotating predicates
112	mov	ar.ec = MEMLAT + 1 ;;	// set the epilog counter
113	cmp.ne	p6, p0 = tmp3, r0	// do we have to copy an extra word?
114	adds	loopcnt = -1, loopcnt;;	// --loopcnt
115(p6)	ld8	value = [src], 8;;
116(p6)	st8	[dest] = value, 8	// copy the "odd" word
117	mov	ar.lc = loopcnt 	// set the loop counter
118	cmp.eq	p6, p0 = 8, len
119(p6)	br.cond.spnt .restore_and_exit;;// the one-word special case
120	adds	adest = 8, dest		// set adest one word ahead of dest
121	adds	asrc = 8, src ;;	// set asrc one word ahead of src
122	nop.b	0			// get the "golden" alignment for
123	nop.b	0			// the next loop
// Two interleaved streams, each stepping by 16: src/dest handle the even
// words and asrc/adest the odd words.  Loads are issued in stage p[0] and
// the matching stores MEMLAT iterations later in stage p[MEMLAT].
124.l0:
125(p[0])		ld8	r[0] = [src], 16
126(p[0])		ld8	q[0] = [asrc], 16
127(p[MEMLAT])	st8	[dest] = r[MEMLAT], 16
128(p[MEMLAT])	st8	[adest] = q[MEMLAT], 16
129		br.ctop.dptk .l0 ;;
130
// Inline copy of the .restore_and_exit sequence for the fast path.
131	mov	pr = saved_pr, -1	// restore the predicate registers
132	mov	ar.lc = saved_lc	// restore the loop counter
133	br.ret.sptk.many b0
// General forward path: at least one of dest, src, len is not a multiple
// of 8.  Short copies (len <= OP_T_THRES) are done entirely by the byte
// loop at .cpyfew.  Longer ones first copy (-dest) & 7 bytes one at a
// time in .l1 so that dest becomes 8-byte aligned.
134.next:
135	cmp.ge	p6, p0 = OP_T_THRES, len	// is len <= OP_T_THRES
136	and	loopcnt = 7, tmp2 		// loopcnt = -dest % 8
137(p6)	br.cond.spnt	.cpyfew			// copy byte by byte
138	;;
139	cmp.eq	p6, p0 = loopcnt, r0
140(p6)	br.cond.sptk	.dest_aligned
141	sub	len = len, loopcnt	// len -= -dest % 8
142	adds	loopcnt = -1, loopcnt	// --loopcnt
143	;;
144	mov	ar.lc = loopcnt
145.l1:					// copy -dest % 8 bytes
146	ld1	value = [src], 1	// value = *src++
147	;;
148	st1	[dest] = value, 1	// *dest++ = value
149	br.cloop.dptk .l1
// dest is now 8-byte aligned.  Split the remaining length into len / 8
// whole words for a pipelined loop and len % 8 trailing bytes, which stay
// in len for .cpyfew.  The two addl instructions form the addresses of
// the linkage-table slots for .table and .loop56; the slots are only
// read on the unaligned-source path below.
150.dest_aligned:
151	and	sh1 = 7, src 		// sh1 = src % 8
152	and	tmp2 = -8, len   	// tmp2 = len & -OPSIZ
153	and	asrc = -8, src		// asrc = src & -OPSIZ  -- align src
154	shr.u	loopcnt = len, 3	// loopcnt = len / 8
155	and	len = 7, len;;		// len = len % 8
156	adds	loopcnt = -1, loopcnt	// --loopcnt
157	addl	tmp4 = @ltoff(.table), gp
158	addl	tmp3 = @ltoff(.loop56), gp
159	mov     ar.ec = MEMLAT + 1	// set EC
160	mov     pr.rot = 1 << 16;;	// set rotating predicates
161	mov	ar.lc = loopcnt		// set LC
162	cmp.eq  p6, p0 = sh1, r0 	// is the src aligned?
163(p6)    br.cond.sptk .src_aligned
// Unaligned source.  The LOOP kernels read through asrc, so src is moved
// past the whole words here and is ready for .cpyfew afterwards.
// 8 * (src % 8) serves as the byte offset of the .table entry; the entry
// is the distance from the wanted kernel to .loop56, so subtracting it
// from the address of .loop56 gives the kernel address.
164	add	src = src, tmp2		// src += len & -OPSIZ
165	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
166	ld8	ploop56 = [tmp3]	// ploop56 = &loop56
167	ld8	ptable = [tmp4];;	// ptable = &table
168	add	tmp3 = ptable, sh1;;	// tmp3 = &table + sh1
// The LOOP kernels have one stage more than the other loops (the store
// is in stage p[MEMLAT+1]), hence the larger epilog count.
169	mov	ar.ec = MEMLAT + 1 + 1 // one more pass needed
170	ld8	tmp4 = [tmp3];;		// tmp4 = loop offset
171	sub	loopaddr = ploop56,tmp4	// loopaddr = &loop56 - loop offset
172	ld8	r[1] = [asrc], 8;;	// w0
173	mov	b6 = loopaddr;;
174	br	b6			// jump to the appropriate loop
175
// One kernel per possible source misalignment of 1 to 7 bytes (shift
// given in bits).  Each kernel ends by branching to .cpyfew.  The order
// and the labels must stay in step with the entries of .table.
176	LOOP(8)
177	LOOP(16)
178	LOOP(24)
179	LOOP(32)
180	LOOP(40)
181	LOOP(48)
182	LOOP(56)
183
// src and dest are both 8-byte aligned: plain pipelined word copy, then
// fall through to .cpyfew for the trailing len bytes.
184.src_aligned:
185.l3:
186(p[0])		ld8	r[0] = [src], 8
187(p[MEMLAT])	st8	[dest] = r[MEMLAT], 8
188		br.ctop.dptk .l3
// Copy the remaining len bytes (possibly none) forward, one byte per
// iteration, using a counted loop on ar.lc.
189.cpyfew:
190	cmp.eq	p6, p0 = len, r0	// is len == 0 ?
191	adds	len = -1, len		// --len;
192(p6)	br.cond.spnt	.restore_and_exit ;;
193	mov	ar.lc = len
194.l4:
195	ld1	value = [src], 1
196	;;
197	st1	[dest] = value, 1
198	br.cloop.dptk	.l4 ;;
// Common exit: put back the caller's predicates and loop counter, then
// return.  ret0 was set to dest in the prologue.
199.restore_and_exit:
200	mov     pr = saved_pr, -1    	// restore the predicate registers
201	mov 	ar.lc = saved_lc	// restore the loop counter
202	br.ret.sptk.many b0
203
204// In the case of a backward copy, optimise only the case when everything
205// is a multiple of 8, otherwise copy byte by byte.  The backward copy is
206// used only when the blocks are overlapping and dest > src.
207
// Both pointers are first moved to the end of their blocks and then walk
// downwards.  tmp4 still holds (dest | src | len) & 7 from the entry
// code.
208.backward:
209	shr.u	loopcnt = len, 3	// loopcnt = len / 8
210	add	src = src, len		// src points one byte past the end
211	add	dest = dest, len ;; 	// dest points one byte past the end
212	mov	ar.ec = MEMLAT + 1	// set the epilog counter
213	mov	pr.rot = 1 << 16	// set rotating predicates
214	adds	loopcnt = -1, loopcnt	// --loopcnt
215	cmp.ne	p6, p0 = tmp4, r0	// if ((dest | src | len) & 7 != 0)
216(p6)	br.cond.sptk .bytecopy ;;	// copy byte by byte backward
217	adds	src = -8, src		// src points to the last word
218	adds	dest = -8, dest 	// dest points to the last word
219	mov	ar.lc = loopcnt;;	// set the loop counter
// Descending pipelined word copy, len / 8 words.
220.l5:
221(p[0])		ld8	r[0] = [src], -8
222(p[MEMLAT])	st8	[dest] = r[MEMLAT], -8
223		br.ctop.dptk .l5
224		br.cond.sptk .restore_and_exit
// Descending pipelined byte copy, len bytes.  ar.ec and pr.rot were set
// at .backward; only ar.lc is recomputed here, from len.
225.bytecopy:
226	adds	src = -1, src		// src points to the last byte
227	adds	dest = -1, dest		// dest points to the last byte
228	adds	loopcnt = -1, len;;	// loopcnt = len - 1
229	mov	ar.lc = loopcnt;;	// set the loop counter
230.l6:
231(p[0])		ld1	r[0] = [src], -1
232(p[MEMLAT])	st1	[dest] = r[MEMLAT], -1
233		br.ctop.dptk .l6
234		br.cond.sptk .restore_and_exit
235END(memmove)
236
// Offset table used by memmove to pick a LOOP(shift) kernel.  Entry k
// (8 bytes each, k = src % 8) holds the distance from .loop<8*k> to
// .loop56; memmove subtracts the entry from the address of .loop56 to
// obtain the kernel address.  Entry 0 is never used for a jump because an
// aligned source (src % 8 == 0) is handled by .src_aligned instead.
237	.rodata
238	.align 8
239.table:
240	data8	0			// dummy entry
241	data8 	.loop56 - .loop8
242	data8 	.loop56 - .loop16
243	data8 	.loop56 - .loop24
244	data8	.loop56 - .loop32
245	data8	.loop56 - .loop40
246	data8	.loop56 - .loop48
247	data8	.loop56 - .loop56
248
// libc_hidden_builtin_def is not defined in this file (presumably it
// comes from the glibc headers and sets up the internal hidden alias for
// memmove -- confirm).
249libc_hidden_builtin_def (memmove)
250