/* Optimized memcpy implementation for PowerPC32/POWER7.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.  */
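
/* Rough C-level picture of the dispatch below (illustrative sketch only,
   not part of the build):

	if (len < 32)
	  goto L(copy_LT_32);			short scalar copies
	else if ((dst & 0xf) != (src & 0xf))
	  goto L(copy_GE_32_unaligned);		lvsl/lvsr + vperm realignment
	else
	  aligned path, up to 128 bytes per iteration with VSX.  */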

	.machine  power7
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	stwu    1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	stw	31,24(1)
	mr      30,3
	cmplwi  cr1,5,31
	neg	0,3
	cfi_offset(31,-8)
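	/* r30 preserves the original DST so it can be returned; r31 (saved
	   above) will hold the remaining length on the >= 32-byte paths.  */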
	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
				    code.  */

	andi.   11,3,15	      /* Check alignment of DST.  */
	clrlwi  10,4,28	      /* Check alignment of SRC.  */
	cmplw   cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srwi    9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrlwi  0,0,29
	mtcrf   0x01,0
	subf    31,0,5

	/* Get the SRC aligned to 8 bytes.  */

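	/* mtcrf 0x01 above placed the low bits of r0 (the byte count needed
	   to reach 8-byte alignment) into cr7: CR bit 31 selects a 1-byte
	   copy, bit 30 a 2-byte copy and bit 29 a 4-byte copy; each bf
	   below skips its piece when the bit is clear.  */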
1:	bf	31,2f
	lbz	6,0(12)
	addi    12,12,1
	stb	6,0(3)
	addi    3,3,1
2:	bf      30,4f
	lhz     6,0(12)
	addi    12,12,2
	sth     6,0(3)
	addi    3,3,2
4:	bf      29,0f
	lwz     6,0(12)
	addi    12,12,4
	stw     6,0(3)
	addi    3,3,4
0:
	clrlwi  10,12,29      /* Check alignment of SRC again.  */
	srwi    9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrlwi  11,31,29
	mtcrf   0x01,9

	srwi    8,31,5
	cmplwi  cr1,9,4
	cmplwi  cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	at a multiple of 32 bytes.  */

	bf	30,1f
	lfd     6,0(12)
	lfd     7,8(12)
	addi    11,12,16
	mtctr   8
	stfd    6,0(3)
	stfd    7,8(3)
	addi    10,3,16
	bf      31,4f
	lfd     0,16(12)
	stfd    0,16(3)
	blt     cr1,3f
	addi    11,12,24
	addi    10,3,24
	b       4f

	.align  4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr   8
	bf      31,4f
	lfd     6,0(12)
	addi    11,12,8
	stfd    6,0(3)
	addi    10,3,8

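	/* The copies above peel off 1~3 leading doublewords so that the
	   number of remaining full doublewords is a multiple of four; r11
	   and r10 now point at the next SRC and DST locations.  Roughly,
	   in C (illustrative sketch only):

		peel = (remaining / 8) % 4;
		while (peel--)
		  *dst64++ = *src64++;
	 */
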
L(aligned_copy):
	/* Main aligned copy loop.  Copies up to 128 bytes at a time.  */
	.align  4
4:
	/* Check for any 32-byte or 64-byte lumps that are outside of a
	   nice 128-byte range.  R8 contains the number of 32-byte
	   lumps, so drop this into the CR, and use the SO/EQ bits to help
	   handle the 32- or 64-byte lumps.  Then handle the rest with an
	   unrolled 128-bytes-at-a-time copy loop. */
	mtocrf	1,8
	li	6,16	# 16() index
	li	7,32	# 32() index
	li	8,48	# 48() index
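	/* lxvd2x/stxvd2x only exist in indexed form, so the 16/32/48 byte
	   offsets are kept in r6/r7/r8 (r8 also doubles as a scratch count
	   register below and is reloaded with 48 afterwards).  */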

L(aligned_32byte):
	/* If the SO bit (indicating a 32-byte lump) is not set, move along. */
	bns	cr7,L(aligned_64byte)
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	addi	11,11,32
	stxvd2x	6,0,10
	stxvd2x	7,10,6
	addi	10,10,32

L(aligned_64byte):
	/* If the EQ bit (indicating a 64-byte lump) is not set, move along. */
	bne	cr7,L(aligned_128setup)
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	lxvd2x	8,11,7
	lxvd2x	9,11,8
	addi	11,11,64
	stxvd2x	6,0,10
	stxvd2x	7,10,6
	stxvd2x	8,10,7
	stxvd2x	9,10,8
	addi	10,10,64

L(aligned_128setup):
	/* Set up for the 128-bytes-at-a-time copy loop.  */
	srwi	8,31,7
	cmpwi	8,0	# Any 4x lumps left?
	beq	3f	# if not, move along.
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	mtctr	8	# otherwise, load the ctr and begin.
	li	8,48	# 48() index
	b	L(aligned_128loop)

L(aligned_128head):
	/* For the second and subsequent iterations of this loop.  */
	lxvd2x	6,0,11
	lxvd2x	7,11,6
L(aligned_128loop):
	lxvd2x	8,11,7
	lxvd2x	9,11,8
	stxvd2x	6,0,10
	addi	11,11,64
	stxvd2x	7,10,6
	stxvd2x	8,10,7
	stxvd2x	9,10,8
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	addi	10,10,64
	lxvd2x	8,11,7
	lxvd2x	9,11,8
	addi	11,11,64
	stxvd2x	6,0,10
	stxvd2x	7,10,6
	stxvd2x	8,10,7
	stxvd2x	9,10,8
	addi	10,10,64
	bdnz	L(aligned_128head)

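	/* Each pass through the loop above moves 128 bytes with eight
	   16-byte VSX load/store pairs; the first two loads of each pass
	   are issued at L(aligned_128setup)/L(aligned_128head) so they can
	   overlap with the stores of the previous pass.  */
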
3:
	/* Check for tail bytes.  */
	clrrwi  0,31,3
	mtcrf   0x01,31
	beq	cr6,0f

.L9:
	add	3,3,0
	add	12,12,0

	/*  At this point we have a tail of 0-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz     6,0(12)
	addi    12,12,4
	stw     6,0(3)
	addi    3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz     6,0(12)
	addi    12,12,2
	sth     6,0(3)
	addi    3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz     31,24(1)
	addi    1,1,32
	blr

	/* Handle copies of 0~31 bytes.  */
	.align  4
L(copy_LT_32):
	cmplwi  cr6,5,8
	mr	12,4
	mtcrf   0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrwi  11,4,2
	andi.   0,8,3
	cmplwi  cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)
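
	/* r0 = -SRC & 3 is the number of bytes needed to reach 4-byte SRC
	   alignment (zero means it is already aligned, hence the beq
	   above).  A rough C sketch of this path (illustrative only):

		head = -(uintptr_t) src & 3;
		copy head bytes, then 16/8/4/2/1-byte pieces selected by
		the bits of the remaining length.  */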

	/* Force 4-byte alignment for SRC.  */
	mtocrf  0x01,0
	subf    10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi    12,12,2
	sth	6,0(3)
	addi    3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi    12,12,1
	stb	6,0(3)
	addi    3,3,1

	.align  4
L(end_4bytes_alignment):
	cmplwi  cr1,10,16
	mtcrf   0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz     7,4(12)
	stw     6,0(3)
	lwz     8,8(12)
	stw     7,4(3)
	lwz     6,12(12)
	addi    12,12,16
	stw     8,8(3)
	stw     6,12(3)
	addi    3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz     6,0(12)
	lwz     7,4(12)
	addi    12,12,8
	stw     6,0(3)
	stw     7,4(3)
	addi    3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz     6,0(12)
	addi    12,12,4
	stw     6,0(3)
	addi    3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz     6,0(12)
	sth     6,0(3)
	bf      31,0f
	lbz     7,2(12)
	stb     7,2(3)

	/* Return original DST pointer.  */
	mr      3,30
	lwz     30,20(1)
	addi    1,1,32
	blr

	.align  4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi    1,1,32
	blr

	/* Handles copies of 0~8 bytes.  */
	.align  4
L(copy_LE_8):
	bne	cr6,4f

	/* Though we could've used lfd/stfd here, they are still
	slow for unaligned cases.  */

	lwz	6,0(4)
	lwz     7,4(4)
	stw     6,0(3)
	stw     7,4(3)

	/* Return original DST pointer.  */
	mr      3,30
	lwz     30,20(1)
	addi    1,1,32
	blr

	.align  4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw     6,0(3)
	bf      30,5f
	lhz     7,4(4)
	sth     7,4(3)
	bf      31,0f
	lbz     8,6(4)
	stb     8,6(3)

	/* Return original DST pointer.  */
	mr      3,30
	lwz     30,20(1)
	addi    1,1,32
	blr

	.align  4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return original DST pointer.  */
	mr	3,30
	lwz     30,20(1)
	addi    1,1,32
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
	the data, allowing for aligned DST stores.  */
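
	/* This is the classic Altivec/VSX realignment sequence: load the
	   aligned quadwords that bracket SRC and use a permute control
	   vector from lvsl/lvsr to shift each adjacent pair so the result
	   can be stored aligned at DST.  Roughly (illustrative only):

		prev = aligned_load (src);
		while (blocks--)
		  {
		    next = aligned_load (src + 16);
		    aligned_store (dst, permute (prev, next, control));
		    prev = next;  src += 16;  dst += 16;
		  }
	 */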
	.align  4
L(copy_GE_32_unaligned):
	andi.   11,3,15	      /* Check alignment of DST.  */
	clrlwi  0,0,28	      /* Number of bytes until the 1st
			      quadword of DST.  */
	srwi    9,5,4	      /* Number of full quadwords remaining.  */

	beq    L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned; get it aligned.  */

	mtcrf   0x01,0
	subf    31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
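	/* As before, cr7 holds the low bits of the byte count needed to
	   reach 16-byte DST alignment; each bf below skips its piece when
	   the corresponding bit is clear.  */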
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi    12,12,1
	stb	6,0(3)
	addi    3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz     6,0(12)
	addi    12,12,2
	sth     6,0(3)
	addi    3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz     6,0(12)
	addi    12,12,4
	stw     6,0(3)
	addi    3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	lfd	6,0(12)
	addi    12,12,8
	stfd    6,0(3)
	addi    3,3,8
0:
	clrlwi  10,12,28      /* Check alignment of SRC.  */
	srwi    9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indexes to speed up the indexed vector operations.  */
	clrlwi  11,31,28
	li      6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmplwi  cr1,11,0
	srwi    8,31,5	      /* Set up the loop counter.  */
	mr      10,3
	mr      11,12
	mtcrf   0x01,9
	cmplwi  cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr    5,0,12
#else
	lvsl    5,0,12
#endif
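	/* On big-endian, lvsl builds the permute control that selects the
	   wanted bytes from the (prev, next) quadword pair; on little-endian,
	   lvsr with the vperm operands swapped does the equivalent job,
	   which is why the vperm operand order is conditional below.  */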
	lvx     3,0,12
	bf      31,L(setup_unaligned_loop)

	/* Copy another 16 bytes to align to 32 bytes for the main loop.  */
	lvx     4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	addi    11,12,16
	addi    10,3,16
	stvx    6,0,3
	vor	3,4,4

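	/* vr3 always holds the last source quadword loaded (copied there by
	   the vor above, and reloaded inside the loop), so each 32-byte
	   iteration of the loop below needs only two new aligned loads.  */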
L(setup_unaligned_loop):
	mtctr   8
	ble     cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align  4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again.  This is faster than using unaligned
	vector instructions, though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm   10,3,4,5
#else
	vperm   10,4,3,5
#endif
	addi    11,11,32
	stvx    6,0,10
	stvx    10,10,6
	addi    10,10,32

	bdnz    L(unaligned_loop)

	.align  4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	clrrwi  0,31,4
	mtcrf   0x01,31
	beq	cr1,0f

	add	3,3,0
	add	12,12,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi    12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi    3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi    12,12,4
	stw	6,0(3)
	addi    3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi    12,12,2
	sth	6,0(3)
	addi    3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz     30,20(1)
	lwz	31,24(1)
	addi    1,1,32
	blr

END (memcpy)
libc_hidden_builtin_def (memcpy)