/* Optimized memmove implementation for POWER10.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This implementation checks whether 'src' and 'dest' overlap.  If they
   do not overlap, or if 'src' is ahead of 'dest', it copies forward.
   Otherwise, an optimized backward copy is used.  */

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
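/* Note: every vector instruction used below (lxv, stxv, lxvl, stxvl)
   is available from ISA 3.0, which is why targeting power9 is
   sufficient here.  */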
	.machine power9
ENTRY_TOCLESS (MEMMOVE, 5)
	CALL_MCOUNT 3

L(_memmove):
	.p2align 5
	/* Check for overlap: the unsigned difference 'dest - src' is smaller
	   than 'len' only when 'dest' lies within 'len' bytes at or above
	   'src'; branch to the backward copy in that case.  */
	subf	r9,r4,r3
	cmpld	cr7,r9,r5
	blt	cr7,L(memmove_bwd)

	/* Fast path for lengths of at most 16 bytes.  */
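	/* lxvl/stxvl take the byte count from bits 0:7 (the most
	   significant byte) of the length register and copy at most
	   16 bytes, hence the shift left by 56.  */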
	sldi	r7,r5,56
	lxvl	32+v2,r4,r7
	stxvl	32+v2,r3,r7
	subic.	r8,r5,16
	blelr

	/* For lengths shorter than 256 bytes, aligning the dest address to
	   16 bytes either hurts performance or is irrelevant, so this
	   comparison is also used to skip the alignment step.  */
	cmpldi	cr6,r5,256
	bge	cr6,L(ge_256)
	/* Account for the first 16-byte copy.  */
	addi	r4,r4,16
	addi	r11,r3,16	/* Use r11 so the dest address stays in r3.  */
	subi	r5,r5,16
	b	L(loop_head)

	.p2align 5
L(ge_256):
	/* Account for the first copy of up to 16 bytes.  This is necessary
	   for memmove because at this point the src address can already be
	   ahead of the dest address.  */
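	/* The initial lxvl/stxvl pair copied min (len % 256, 16) bytes:
	   the sldi by 56 keeps only the low byte of len as the count.
	   Recompute that number here: r9 = min (len % 256, 16).  */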
	clrldi	r9,r5,56
	li	r8,16
	cmpldi	r9,16
	iselgt	r9,r8,r9
	add	r4,r4,r9
	add	r11,r3,r9	/* Use r11 so the dest address stays in r3.  */
	sub	r5,r5,r9

	/* Align dest to 16 bytes: r9 = (-dest) % 16 is the number of bytes
	   needed to reach the next 16-byte boundary.  */
	neg	r7,r3
	clrldi.	r9,r7,60
	beq	L(loop_head)

	.p2align 5
	sldi	r6,r9,56
	lxvl	32+v0,r4,r6
	stxvl	32+v0,r11,r6
	sub	r5,r5,r9
	add	r4,r4,r9
	add	r11,r11,r9
L(loop_head):
	cmpldi	r5,63
	ble	L(final_64)

	srdi.	r7,r5,7
	beq	L(loop_tail)

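	/* r7 = len / 128, the number of iterations of the main loop.  */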
	mtctr	r7

/* Main loop that copies 128 bytes each iteration.  */
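/* The secondary pointers r9/r10, offset by 64 bytes from r4/r11, split
   each iteration into two independent load/store streams, which can
   help instruction scheduling.  */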
	.p2align 5
L(loop):
	addi	r9,r4,64
	addi	r10,r11,64

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,128
	addi	r11,r11,128

	lxv	32+v4,0(r9)
	lxv	32+v5,16(r9)
	lxv	32+v6,32(r9)
	lxv	32+v7,48(r9)

	stxv	32+v4,0(r10)
	stxv	32+v5,16(r10)
	stxv	32+v6,32(r10)
	stxv	32+v7,48(r10)

	bdnz	L(loop)
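	/* Keep len % 128 and return if no tail bytes remain.  */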
	clrldi.	r5,r5,57
	beqlr

/* Copy 64 bytes.  */
	.p2align 5
L(loop_tail):
	cmpldi	cr5,r5,63
	ble	cr5,L(final_64)

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,64
	addi	r11,r11,64
	subi	r5,r5,64

/* Copies the last 1-63 bytes.  */
	.p2align 5
L(final_64):
	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
	clrrdi.	r8,r5,4
	beq	L(tail1)

	cmpldi	cr5,r5,32
	lxv	32+v0,0(r4)
	blt	cr5,L(tail2)

	cmpldi	cr6,r5,48
	lxv	32+v1,16(r4)
	blt	cr6,L(tail3)

	.p2align 5
	lxv	32+v2,32(r4)
	stxv	32+v2,32(r11)
L(tail3):
	stxv	32+v1,16(r11)
L(tail2):
	stxv	32+v0,0(r11)
	sub	r5,r5,r8
	add	r4,r4,r8
	add	r11,r11,r8
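	/* Copy the remaining 0-15 bytes with a single length-controlled
	   load/store pair.  */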
	.p2align 5
L(tail1):
	sldi	r6,r5,56
	lxvl	32+v4,r4,r6
	stxvl	32+v4,r11,r6
	blr

/* If dest and src overlap, we should copy backwards.  */
L(memmove_bwd):
	add	r11,r3,r5
	add	r4,r4,r5
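	/* r4/r11 now point one byte past the end of src/dest; the copy
	   proceeds from high addresses to low ones.  */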

	/* Optimization for lengths smaller than 16 bytes.  */
	cmpldi	cr5,r5,15
	ble	cr5,L(tail1_bwd)

	/* For shorter lengths the alignment either slows things down or is
	   irrelevant.  The forward path reuses a comparison against 256 that
	   it needs anyway; here 128 is used instead, since it reduces code
	   size and improves readability.  */
	cmpldi	cr7,r5,128
	blt	cr7,L(bwd_loop_tail)

	/* Align the end of dest to 16 bytes: r9 = dest_end % 16 is the
	   number of trailing bytes to copy first.  */
	.p2align 5
	clrldi.	r9,r11,60
	beq	L(bwd_loop_head)
	sub	r4,r4,r9
	sub	r11,r11,r9
	/* Use a length-controlled lxvl here (matching the forward path)
	   rather than a full 16-byte lxv, so no bytes beyond the end of
	   the source buffer are read.  */
	sldi	r6,r9,56
	lxvl	32+v0,r4,r6
	stxvl	32+v0,r11,r6
	sub	r5,r5,r9

L(bwd_loop_head):
	srdi.	r7,r5,7
	beq	L(bwd_loop_tail)

	mtctr	r7

/* Main loop that copies 128 bytes every iteration.  */
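/* As in the forward loop, the secondary pointers r9/r10 sit 64 bytes
   below r4/r11 and split each iteration into two independent
   load/store streams.  */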
	.p2align 5
L(bwd_loop):
	addi	r9,r4,-64
	addi	r10,r11,-64

	lxv	32+v0,-16(r4)
	lxv	32+v1,-32(r4)
	lxv	32+v2,-48(r4)
	lxv	32+v3,-64(r4)

	stxv	32+v0,-16(r11)
	stxv	32+v1,-32(r11)
	stxv	32+v2,-48(r11)
	stxv	32+v3,-64(r11)

	addi	r4,r4,-128
	addi	r11,r11,-128

	lxv	32+v0,-16(r9)
	lxv	32+v1,-32(r9)
	lxv	32+v2,-48(r9)
	lxv	32+v3,-64(r9)

	stxv	32+v0,-16(r10)
	stxv	32+v1,-32(r10)
	stxv	32+v2,-48(r10)
	stxv	32+v3,-64(r10)

	bdnz	L(bwd_loop)
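	/* Keep len % 128 and return if no tail bytes remain.  */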
	clrldi.	r5,r5,57
	beqlr

/* Copy 64 bytes.  */
	.p2align 5
L(bwd_loop_tail):
	cmpldi	cr5,r5,63
	ble	cr5,L(bwd_final_64)

	addi	r4,r4,-64
	addi	r11,r11,-64

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	subi	r5,r5,64

/* Copies the last 1-63 bytes.  */
	.p2align 5
L(bwd_final_64):
	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
	clrrdi.	r8,r5,4
	beq	L(tail1_bwd)

	cmpldi	cr5,r5,32
	lxv	32+v2,-16(r4)
	blt	cr5,L(tail2_bwd)

	cmpldi	cr6,r5,48
	lxv	32+v1,-32(r4)
	blt	cr6,L(tail3_bwd)

	.p2align 5
	lxv	32+v0,-48(r4)
	stxv	32+v0,-48(r11)
L(tail3_bwd):
	stxv	32+v1,-32(r11)
L(tail2_bwd):
	stxv	32+v2,-16(r11)
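	/* Rewind r4/r11 to the start of the remaining block; the lowest
	   len % 16 bytes are copied last, preserving the backward order.  */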
	sub	r4,r4,r5
	sub	r11,r11,r5
	sub	r5,r5,r8
	sldi	r6,r5,56
	lxvl	32+v4,r4,r6
	stxvl	32+v4,r11,r6
	blr

/* Copies the last 0-15 bytes.  */
	.p2align 5
L(tail1_bwd):
	sub	r4,r4,r5
	sub	r11,r11,r5
	sldi	r6,r5,56
	lxvl	32+v4,r4,r6
	stxvl	32+v4,r11,r6
	blr

END_GEN_TB (MEMMOVE,TB_TOCLESS)
libc_hidden_builtin_def (memmove)