/* Optimized memcpy implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.  */

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define dst 11		/* Use r11 so r3 is kept unchanged (memcpy returns DST).  */
#define src 4
#define cnt 5

	.machine power7
ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,cnt,31
	neg	0,3
	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
				    code.  */

/* Align copies that use VSX instructions to a quadword boundary.  This
   avoids alignment traps when memcpy is used on non-cacheable memory (for
   instance, memory-mapped I/O).  */
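/* r10 = DST & 0xF and r11 = SRC & 0xF; cr0, set by the andi. below, later
   tells whether DST is already quadword aligned.  */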
	andi.	10,3,15
	clrldi	11,4,60
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	dst,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
	clrldi	0,0,60
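	/* r0 = (-DST) & 0xF is the number of bytes needed to reach the next
	   16-byte DST boundary; CR7 holds the same low four bits, so each
	   power-of-two chunk below can be tested with bf.  */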

/* Get the DST and SRC aligned to 16 bytes.  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,16f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
16:
	subf	cnt,0,cnt

/* Main aligned copy loop. Copies 128 bytes at a time. */
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
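	/* CR6 gets the 16/32/64/128-byte bits of cnt so L(aligned_tail) can
	   test the 64-, 32- and 16-byte chunks; r12 is the number of full
	   128-byte blocks.  */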
	mtocrf	0x02,cnt
	srdi	12,cnt,7
	cmpdi	12,0
	beq	L(aligned_tail)
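	/* Pre-load the first 32 bytes of the first block; later iterations
	   load theirs at L(aligned_128head), keeping the loads ahead of the
	   stores in the loop body.  */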
	lvx	6,0,src
	lvx	7,src,6
	mtctr	12
	b	L(aligned_128loop)

	.align  4
L(aligned_128head):
	/* For the second and later iterations of this loop.  */
	lvx	6,0,src
	lvx	7,src,6
L(aligned_128loop):
	lvx	8,src,7
	lvx	9,src,8
	stvx	6,0,dst
	addi	src,src,64
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	lvx	6,0,src
	lvx	7,src,6
	addi	dst,dst,64
	lvx	8,src,7
	lvx	9,src,8
	addi	src,src,64
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	dst,dst,64
	bdnz	L(aligned_128head)

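/* Copy the remaining 0~127 bytes in descending power-of-two chunks (64, 32,
   16, 8, then 0~7 bytes), selected by the bits of cnt held in CR6 and CR7.  */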
L(aligned_tail):
	mtocrf	0x01,cnt
	bf	25,32f
	lvx	6,0,src
	lvx	7,src,6
	lvx	8,src,7
	lvx	9,src,8
	addi	src,src,64
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	dst,dst,64
32:
	bf	26,16f
	lvx	6,0,src
	lvx	7,src,6
	addi	src,src,32
	stvx	6,0,dst
	stvx	7,dst,6
	addi	dst,dst,32
16:
	bf	27,8f
	lvx	6,0,src
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
8:
	bf	28,4f
	ld	6,0(src)
	addi	src,src,8
	std     6,0(dst)
	addi	dst,dst,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw     6,0(dst)
	bf      30,L(tail5)
	lhz     7,4(src)
	sth     7,4(dst)
	bflr	31
	lbz     8,6(src)
	stb     8,6(dst)
	/* Return original DST pointer.  */
	blr


/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	mr	dst,3
	cmpldi	cr6,cnt,8
	mtocrf	0x01,cnt
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3
	cmpldi	cr1,cnt,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	cnt,0,cnt
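	/* CR7 now holds the number of bytes (0~3) needed to word-align SRC:
	   copy 1 and/or 2 bytes as selected by its low two bits.  */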
2:
	bf	30,1f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,cnt,16
	mtocrf	0x01,cnt

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	lwz	8,8(src)
	stw	7,4(dst)
	lwz	6,12(src)
	addi	src,src,16
	stw	8,8(dst)
	stw	6,12(dst)
	addi	dst,dst,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(src)
	sth	6,0(dst)
	bflr	31
	lbz	7,2(src)
	stb	7,2(dst)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(src)
	stb	6,4(dst)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(src)
	stb	6,0(dst)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
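	/* cr6 still holds the (cnt == 8) comparison from L(copy_LT_32):
	   anything shorter than 8 bytes goes through the 0~7-byte tail code,
	   while exactly 8 bytes is copied below as two words.  */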
	bne	cr6,L(tail4)

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	stw	7,4(dst)
	blr


/* Handle copies of 32+ bytes where the SRC and DST alignments differ.
   DST is first brought up to a quadword boundary; after that, aligned
   quadword loads from SRC are shifted to realign the data, allowing for
   aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	cnt,0,cnt

	/* Vector instructions work best when proper alignment (16-bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,0f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
0:
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	/* DST is now quadword aligned; it is OK to copy the bytes.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	10,cnt,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,10,0
	srdi	8,cnt,5	      /* Set up the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
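	/* lvsr/lvsl below turn SRC's misalignment into a permute control
	   vector; each vperm that follows merges two consecutive aligned
	   quadword loads into 16 bytes of correctly shifted data.  */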
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,src
#else
	lvsl	5,0,src
#endif
	lvx	3,0,src
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* The quadword count is odd: copy 16 bytes now so the loop below can
	   move 32 bytes per iteration.  */
	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
	vor	3,4,4
	clrrdi	0,src,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied, but in
	   order to keep the loads aligned we may have to copy some portions
	   again.  Even so, this is faster than using unaligned vector
	   instructions.  */
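	/* vr3 carries the last aligned quadword from the previous iteration;
	   the two vperms below combine consecutive aligned loads to produce
	   the 16 bytes at SRC and at SRC+16.  */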

	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,src,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	src,src,32
	stvx	6,0,dst
	stvx	10,dst,6
	addi	dst,dst,32
	bdnz	L(unaligned_loop)

	clrrdi	0,src,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,cnt
	beqlr	cr1

	add	src,src,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)