/* Optimized memmove implementation for PowerPC64/POWER7.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This optimization checks whether memory at 'dest' overlaps with memory at
   'src'.  If it does not, an optimized memcpy is used (similar to the memcpy
   for POWER7, embedded here to gain some cycles).
   If source and destination overlap, an optimized backwards memcpy is used
   instead.  */
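
/* Illustrative sketch (not assembled): the dispatch at L(_memmove) below is
   a single unsigned compare.  Roughly, in C, with forward_copy and
   backward_copy as hypothetical stand-ins for the two code paths in this
   file:

       if ((uintptr_t) dest - (uintptr_t) src >= len)
         forward_copy (dest, src, len);    -- memcpy-like forward path
       else
         backward_copy (dest, src, len);   -- L(memmove_bwd)
       return dest;

   When dest is below src the unsigned difference wraps to a large value and
   still compares >= len, so the forward path also covers that case.  */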

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
	.machine power7
ENTRY_TOCLESS (MEMMOVE, 5)
	CALL_MCOUNT 3

L(_memmove):
	subf    r9,r4,r3
	cmpld   cr7,r9,r5
	blt	cr7,L(memmove_bwd)

	cmpldi	cr1,r5,31
	neg	0,3
	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
				       code.  */

	andi.	10,3,15
	clrldi	11,4,60
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	r11,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
	clrldi	0,0,60

/* Get the DST and SRC aligned to 16 bytes.  */
1:
	bf	31,2f
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1
2:
	bf	30,4f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
4:
	bf	29,8f
	lwz	6,0(r4)
	addi	r4,r4,4
	stw	6,0(r11)
	addi	r11,r11,4
8:
	bf	28,16f
	ld	6,0(r4)
	addi	r4,r4,8
	std	6,0(r11)
	addi	r11,r11,8
16:
	subf	r5,0,r5

/* Main aligned copy loop. Copies 128 bytes at a time. */
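/* Rough C shape of the loop below (a sketch, not assembled; load16 and
   store16 stand for 16-byte vector loads/stores such as lvx/stvx, and both
   pointers are already 16-byte aligned at this point):

       for (iters = len >> 7; iters != 0; iters--)
         {
           for (i = 0; i < 128; i += 16)
             store16 (dst + i, load16 (src + i));
           src += 128;
           dst += 128;
         }

   The assembly below unrolls this fully; the first two vector loads are done
   before entering the loop (and at L(aligned_128head) for later iterations),
   and L(aligned_tail) handles the remaining 0~127 bytes.  */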
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,r5
	srdi	12,r5,7
	cmpdi	12,0
	beq	L(aligned_tail)
	lvx	6,0,r4
	lvx	7,r4,6
	mtctr	12
	b	L(aligned_128loop)

	.align  4
L(aligned_128head):
	/* for the 2nd + iteration of this loop. */
	lvx	6,0,r4
	lvx	7,r4,6
L(aligned_128loop):
	lvx	8,r4,7
	lvx	9,r4,8
	stvx	6,0,r11
	addi	r4,r4,64
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	lvx	6,0,r4
	lvx	7,r4,6
	addi	r11,r11,64
	lvx	8,r4,7
	lvx	9,r4,8
	addi	r4,r4,64
	stvx	6,0,r11
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	addi	r11,r11,64
	bdnz	L(aligned_128head)

L(aligned_tail):
	mtocrf	0x01,r5
	bf	25,32f
	lvx	6,0,r4
	lvx	7,r4,6
	lvx	8,r4,7
	lvx	9,r4,8
	addi	r4,r4,64
	stvx	6,0,r11
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	addi	r11,r11,64
32:
	bf	26,16f
	lvx	6,0,r4
	lvx	7,r4,6
	addi	r4,r4,32
	stvx	6,0,r11
	stvx	7,r11,6
	addi	r11,r11,32
16:
	bf	27,8f
	lvx	6,0,r4
	addi	r4,r4,16
	stvx	6,0,r11
	addi	r11,r11,16
8:
	bf	28,4f
	ld	6,0(r4)
	addi	r4,r4,8
	std     6,0(r11)
	addi	r11,r11,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw     6,0(r11)
	bf      30,L(tail5)
	lhz     7,4(r4)
	sth     7,4(r11)
	bflr	31
	lbz     8,6(r4)
	stb     8,6(r11)
	/* Return original DST pointer.  */
	blr

/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	mr	r11,3
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3
	cmpldi	cr1,r5,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	r5,0,r5
2:
	bf	30,1f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(r4)
	lwz	7,4(r4)
	stw	6,0(r11)
	lwz	8,8(r4)
	stw	7,4(r11)
	lwz	6,12(r4)
	addi	r4,r4,16
	stw	8,8(r11)
	stw	6,12(r11)
	addi	r11,r11,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(r4)
	lwz	7,4(r4)
	addi	r4,r4,8
	stw	6,0(r11)
	stw	7,4(r11)
	addi	r11,r11,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw	6,0(r11)
	bf	30,L(tail5)
	lhz	7,4(r4)
	sth	7,4(r11)
	bflr	31
	lbz	8,6(r4)
	stb	8,6(r11)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(r4)
	sth	6,0(r11)
	bflr	31
	lbz	7,2(r4)
	stb	7,2(r11)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(r4)
	stb	6,4(r11)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(r4)
	stb	6,0(r11)
	/* Return original DST pointer.  */
	blr

/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,L(tail4)

	/* Though we could've used ld/std here, they are still
	slow for unaligned cases.  */

	lwz	6,0(r4)
	lwz	7,4(r4)
	stw	6,0(r11)
	stw	7,4(r11)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
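/* Sketch of the realignment scheme used below (illustrative, not assembled).
   With off = src & 15, each aligned 16-byte store to DST is built from two
   aligned 16-byte loads surrounding the unaligned source data:

       off  = (uintptr_t) src & 15;
       prev = load16 (src - off);              -- aligned load
       while (full quadwords remain)
         {
           next = load16 (src - off + 16);     -- next aligned load
           store16 (dst, take16 (prev, next, off));
           prev = next;  src += 16;  dst += 16;
         }

   take16 (a, b, off) stands for what lvsl/lvsr plus vperm compute: the 16
   bytes starting at offset 'off' within the 32-byte concatenation of a and b
   (vperm's operand order is swapped for little-endian, hence the
   __LITTLE_ENDIAN__ variants).  The loop below moves 32 bytes per iteration,
   i.e. two such stores.  */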
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st r11 quadword.  */
	srdi	9,r5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	r5,0,r5

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1
2:
	bf	30,4f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
4:
	bf	29,8f
	lwz	6,0(r4)
	addi	r4,r4,4
	stw	6,0(r11)
	addi	r11,r11,4
8:
	bf	28,0f
	ld	6,0(r4)
	addi	r4,r4,8
	std	6,0(r11)
	addi	r11,r11,8
0:
	srdi	9,r5,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrldi	10,r5,60
	li	6,16	      /* Index for 16-bytes offsets.  */
	li	7,32	      /* Index for 32-bytes offsets.  */
	cmpldi	cr1,10,0
	srdi	8,r5,5	      /* Setup the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,r4
#else
	lvsl	5,0,r4
#endif
	lvx	3,0,r4
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
	lvx	4,r4,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	r4,r4,16
	stvx	6,0,r11
	addi	r11,r11,16
	vor	3,4,4
	clrrdi	0,r4,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again. This is faster than having unaligned
	vector instructions though.  */

	lvx	4,r4,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,r4,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	r4,r4,32
	stvx	6,0,r11
	stvx	10,r11,6
	addi	r11,r11,32
	bdnz	L(unaligned_loop)

	clrrdi	0,r4,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,r5
	beqlr	cr1

	add	r4,r4,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(r4)
	lwz	7,4(r4)
	addi	r4,r4,8
	stw	6,0(r11)
	stw	7,4(r11)
	addi	r11,r11,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw	6,0(r11)
	bf	30,L(tail5)
	lhz	7,4(r4)
	sth	7,4(r11)
	bflr	31
	lbz	8,6(r4)
	stb	8,6(r11)
	/* Return original DST pointer.  */
	blr

	/* Start of the backward memcpy implementation: the algorithm first
	   checks whether src and dest have the same alignment; if they do, it
	   aligns both to 16 bytes and copies using VSX instructions.
	   If they do not, it aligns dest to 16 bytes and uses VMX (Altivec)
	   instructions to read two 16-byte chunks at a time, shift/permute the
	   bytes read, and write them aligned to dest.  */
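
	/* Illustrative C shape of the backward path (a sketch, not assembled):
	   both pointers are first advanced past the last byte, and the copy
	   then walks downwards so overlapping source bytes are read before the
	   destination stores can clobber them:

	       char *d = (char *) dest + len;
	       const char *s = (const char *) src + len;
	       while (len--)
	         *--d = *--s;

	   The code below does the same in 16-byte aligned or shifted/permuted
	   chunks rather than one byte at a time.  */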
L(memmove_bwd):
	cmpldi	cr1,r5,31
	/* Copy is done backwards: update the pointers and check alignment.  */
	add	r11,r3,r5
	add	r4,r4,r5
	mr	r0,r11
	ble	cr1, L(copy_LT_32_bwd)  /* If move < 32 bytes use short move
				           code.  */

	andi.	r10,r11,15	    /* Check if r11 is aligned to 16 bytes  */
	clrldi	r9,r4,60	    /* Check if r4 is aligned to 16 bytes  */
	cmpld	cr6,r10,r9	    /* SRC and DST alignments match?  */

	bne     cr6,L(copy_GE_32_unaligned_bwd)
	beq     L(aligned_copy_bwd)

	mtocrf	0x01,r0
	clrldi	r0,r0,60

/* Get the DST and SRC aligned to 16 bytes.  */
1:
	bf	31,2f
	lbz	r6,-1(r4)
	subi	r4,r4,1
	stb	r6,-1(r11)
	subi	r11,r11,1
2:
	bf	30,4f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
4:
	bf	29,8f
	lwz	r6,-4(r4)
	subi	r4,r4,4
	stw	r6,-4(r11)
	subi	r11,r11,4
8:
	bf	28,16f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std	r6,-8(r11)
	subi	r11,r11,8
16:
	subf	r5,0,r5

/* Main aligned copy loop. Copies 128 bytes at a time. */
L(aligned_copy_bwd):
	li	r6,-16
	li	r7,-32
	li	r8,-48
	li	r9,-64
	mtocrf	0x02,r5
	srdi	r12,r5,7
	cmpdi	r12,0
	beq	L(aligned_tail_bwd)
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	mtctr	12
	b	L(aligned_128loop_bwd)

	.align  4
L(aligned_128head_bwd):
	/* for the 2nd + iteration of this loop. */
	lvx	v6,r4,r6
	lvx	v7,r4,r7
L(aligned_128loop_bwd):
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	stvx	v6,r11,r6
	subi	r4,r4,64
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	subi	r11,r11,64
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	subi	r4,r4,64
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	subi	r11,r11,64
	bdnz	L(aligned_128head_bwd)

L(aligned_tail_bwd):
	mtocrf	0x01,r5
	bf	25,32f
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	subi	r4,r4,64
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	subi	r11,r11,64
32:
	bf	26,16f
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	subi	r4,r4,32
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	subi	r11,r11,32
16:
	bf	27,8f
	lvx	v6,r4,r6
	subi	r4,r4,16
	stvx	v6,r11,r6
	subi	r11,r11,16
8:
	bf	28,4f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std     r6,-8(r11)
	subi	r11,r11,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2_bwd)
	lwz	r6,-4(r4)
	stw     r6,-4(r11)
	bf      30,L(tail5_bwd)
	lhz     r7,-6(r4)
	sth     r7,-6(r11)
	bflr	31
	lbz     r8,-7(r4)
	stb     r8,-7(r11)
	/* Return original DST pointer.  */
	blr

/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32_bwd):
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(copy_LE_8_bwd)

	/* At least 9 bytes to go.  */
	neg	r8,r4
	andi.	r0,r8,3
	cmpldi	cr1,r5,16
	beq	L(copy_LT_32_aligned_bwd)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	r5,0,r5
2:
	bf	30,1f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
1:
	bf	31,L(end_4bytes_alignment_bwd)
	lbz	6,-1(r4)
	subi	r4,r4,1
	stb	6,-1(r11)
	subi	r11,r11,1

	.align	4
L(end_4bytes_alignment_bwd):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(copy_LT_32_aligned_bwd):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	stw	r6,-4(r11)
	lwz	r8,-12(r4)
	stw	r7,-8(r11)
	lwz	r6,-16(r4)
	subi	r4,r4,16
	stw	r8,-12(r11)
	stw	r6,-16(r11)
	subi	r11,r11,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4_bwd)
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	subi	r4,r4,8
	stw	r6,-4(r11)
	stw	r7,-8(r11)
	subi	r11,r11,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4_bwd):
	bf	29,L(tail2_bwd)
	lwz	6,-4(r4)
	stw	6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	7,-6(r4)
	sth	7,-6(r11)
	bflr	31
	lbz	8,-7(r4)
	stb	8,-7(r11)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2_bwd):
	bf	30,1f
	lhz	6,-2(r4)
	sth	6,-2(r11)
	bflr	31
	lbz	7,-3(r4)
	stb	7,-3(r11)
	blr

	.align	4
L(tail5_bwd):
	bflr	31
	lbz	6,-5(r4)
	stb	6,-5(r11)
	blr

	.align	4
1:
	bflr	31
	lbz	6,-1(r4)
	stb	6,-1(r11)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8_bwd):
	bne	cr6,L(tail4_bwd)

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */
	lwz	6,-8(r4)
	lwz	7,-4(r4)
	stw	6,-8(r11)
	stw	7,-4(r11)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned_bwd):
	andi.	r10,r11,15      /* Check alignment of DST against 16 bytes.  */
	srdi	r9,r5,4		/* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont_bwd)

	/* DST is not quadword aligned and r10 holds the address masked to
           compare alignments.  */
	mtocrf	0x01,r10
	subf	r5,r10,r5

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	r6,-1(r4)
	subi	r4,r4,1
	stb	r6,-1(r11)
	subi	r11,r11,1
2:
	bf	30,4f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
4:
	bf	29,8f
	lwz	r6,-4(r4)
	subi	r4,r4,4
	stw	r6,-4(r11)
	subi	r11,r11,4
8:
	bf	28,0f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std	r6,-8(r11)
	subi	r11,r11,8
0:
	srdi	r9,r5,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont_bwd):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrldi	r10,r5,60
	li	r6,-16	      /* Index for 16-bytes offsets.  */
	li	r7,-32	      /* Index for 32-bytes offsets.  */
	cmpldi	cr1,10,0
	srdi	r8,r5,5	      /* Setup the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,r9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	v5,r0,r4
#else
	lvsl	v5,r0,r4
#endif
	lvx	v3,0,r4
	li	r0,0
	bf	31,L(setup_unaligned_loop_bwd)

	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
	lvx	v4,r4,r6
#ifdef __LITTLE_ENDIAN__
	vperm	v6,v3,v4,v5
#else
	vperm	v6,v4,v3,v5
#endif
	subi	r4,r4,16
	stvx	v6,r11,r6
	subi	r11,r11,16
	vor	v3,v4,v4
	clrrdi	r0,r4,60

L(setup_unaligned_loop_bwd):
	mtctr	r8
	ble	cr6,L(end_unaligned_loop_bwd)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop_bwd):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again. This is faster than having unaligned
	vector instructions though.  */

	lvx	v4,r4,r6
#ifdef __LITTLE_ENDIAN__
	vperm	v6,v3,v4,v5
#else
	vperm	v6,v4,v3,v5
#endif
	lvx	v3,r4,r7
#ifdef __LITTLE_ENDIAN__
	vperm	v10,v4,v3,v5
#else
	vperm	v10,v3,v4,v5
#endif
	subi	r4,r4,32
	stvx	v6,r11,r6
	stvx	v10,r11,r7
	subi	r11,r11,32
	bdnz	L(unaligned_loop_bwd)

	clrrdi	r0,r4,60

	.align	4
L(end_unaligned_loop_bwd):

	/* Check for tail bytes.  */
	mtocrf	0x01,r5
	beqlr	cr1

	add	r4,r4,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	subi	r4,r4,8
	stw	r6,-4(r11)
	stw	r7,-8(r11)
	subi	r11,r11,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2_bwd)
	lwz	r6,-4(r4)
	stw	r6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	r7,-6(r4)
	sth	r7,-6(r11)
	bflr	31
	lbz	r8,-7(r4)
	stb	r8,-7(r11)
	/* Return original DST pointer.  */
	blr
END_GEN_TB (MEMMOVE, TB_TOCLESS)
libc_hidden_builtin_def (memmove)