1/* Optimized memcpy implementation for PowerPC64.
2   Copyright (C) 2003-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
22   Returns 'dst'.
23
   Memcpy handles short copies (< 32-bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
27   There is minimal effort to optimize the alignment of short moves.
28   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
29   of handling unaligned load/stores that do not cross 32-byte boundaries.
30
31   Longer moves (>= 32-bytes) justify the effort to get at least the
32   destination doubleword (8-byte) aligned.  Further optimization is
33   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.
35
36   For POWER6 unaligned loads will take a 20+ cycle hiccup for any
37   L1 cache miss that crosses a 32- or 128-byte boundary.  Store
38   is more forgiving and does not take a hiccup until page or
39   segment boundaries.  So we require doubleword alignment for
40   the source but may take a risk and only require word alignment
41   for the destination.  */
42
43#ifndef MEMCPY
44# define MEMCPY memcpy
45#endif
46	.machine	"power6"
ENTRY_TOCLESS (MEMCPY, 7)
	CALL_MCOUNT 3

  /* r3 = dst, r4 = src, r5 = len.  Save the original dst (the return
     value) in the protected zone below the stack pointer; r31 is saved
     there too since the unaligned path at .L6 uses it as scratch
     (NOTE(review): its restore is not visible in this chunk —
     presumably done at L(du_done)/exit).  */
    cmpldi cr1,5,31
    neg   0,3
    std   3,-16(1)
    std   31,-8(1)
    andi. 11,3,7	/* check alignment of dst.  */
    clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
    clrldi 10,4,61	/* check alignment of src.  */
    cmpldi cr6,5,8
    ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
    mtcrf 0x01,0	/* cr7 = low bits of the dst-alignment byte count.  */
    cmpld cr6,10,11	/* cr6: src misaligned by same amount as dst?  */
    srdi  9,5,3		/* Number of full double words remaining.  */
    beq   .L0		/* cr0 from andi.: dst already DW aligned.  */
63
    subf  5,0,5		/* Reduce len by the bytes moved for alignment.  */
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
     Duplicate some code to maximize fall-through and minimize agen delays.
     cr7 bit 31 selects a 1-byte move, bit 30 a 2-byte move, and bit 29
     a 4-byte move; the offsets differ per path so the pieces stay
     contiguous.  */
1:  bf    31,2f
    lbz   6,0(4)
    stb   6,0(3)
    bf    30,5f
    lhz   6,1(4)
    sth   6,1(3)
    bf    29,0f
    lwz   6,3(4)
    stw   6,3(3)
    b     0f
5:
    bf    29,0f
    lwz   6,1(4)
    stw   6,1(3)
    b     0f

2:  bf    30,4f
    lhz   6,0(4)
    sth   6,0(3)
    bf    29,0f
    lwz   6,2(4)
    stw   6,2(3)
    b     0f

4:  bf    29,0f
    lwz   6,0(4)
    stw   6,0(3)
0:
/* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
    add   4,4,0
    add   3,3,0

    clrldi 10,4,61	/* check alignment of src again.  */
    srdi  9,5,3	/* Number of full double words remaining.  */
101
  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also doubleword aligned.
     If not branch to the unaligned move code at .L6, which uses
     a load, shift, store strategy.

     Otherwise source and destination are doubleword aligned, and we can
     use the optimized doubleword copy loop.  */
    .align  4
.L0:
    clrldi  11,5,61	/* Tail byte count: len & 7.  */
    andi.   0,5,0x78	/* Partial-block doublewords, in bytes (len & 0x78).  */
    srdi    12,5,7	/* Number of 128-byte blocks to move.  */
    cmpldi  cr1,11,0	/* If the tail is 0 bytes  */
    bne-    cr6,.L6     /* If source is not DW aligned.  */

  /* Move doublewords where destination and source are DW aligned.
     Use an unrolled loop to copy 16 doublewords (128-bytes) per iteration.
     If the copy is not an exact multiple of 128 bytes, 1-15
     doublewords are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-7 bytes. These bytes
     are copied a word/halfword/byte at a time as needed to preserve
     alignment.

     For POWER6 the L1 is store-through and the L2 is store-in.  The
     L2 is clocked at half CPU clock so we can store 16 bytes every
     other cycle.  POWER6 also has a load/store bypass so we can do
     load, load, store, store every 2 cycles.

     The following code is sensitive to cache line alignment.  Do not
     make any change without first making sure they don't result in
     splitting ld/std pairs across a cache line.  */

    mtcrf 0x02,5	/* cr6 gets len bits selecting 64-byte piece.  */
    mtcrf 0x01,5	/* cr7 gets len bits selecting 32/16/8-byte pieces.  */
    cmpldi  cr5,12,1	/* cr5: at most one 128-byte block?  */
    beq   L(das_loop)	/* cr0 from andi.: no partial block to set up.  */
141
  /* Copy the 64/32/16/8-byte partial pieces (selected by len bits in
     cr6/cr7) so the remaining length is a multiple of 128 bytes.
     r10/r11 shadow dst/src so the addi pointer bumps can issue ahead
     of the trailing stores without stalling address generation.  */
    bf    25,4f		/* CR bit 25 = len & 64.  */
    .align  3
    ld    6,0(4)
    ld    7,8(4)
    mr    11,4
    mr    10,3
    std   6,0(3)
    std   7,8(3)
    ld    6,16(4)
    ld    7,24(4)
    std   6,16(3)
    std   7,24(3)
    ld    6,0+32(4)
    ld    7,8+32(4)
    addi  4,4,64
    addi  3,3,64
    std   6,0+32(10)
    std   7,8+32(10)
    ld    6,16+32(11)
    ld    7,24+32(11)
    std   6,16+32(10)
    std   7,24+32(10)
4:
    mr    10,3
    bf    26,2f		/* CR bit 26 = len & 32.  */
    ld    6,0(4)
    ld    7,8(4)
    mr    11,4
    nop
    std   6,0(3)
    std   7,8(3)
    ld    6,16(4)
    ld    7,24(4)
    addi  4,4,32
    std   6,16(3)
    std   7,24(3)
    addi  3,3,32
6:
    nop
    bf    27,5f		/* CR bit 27 = len & 16.  */
    ld    6,0+32(11)
    ld    7,8+32(11)
    addi  4,4,16
    addi  3,3,16
    std   6,0+32(10)
    std   7,8+32(10)
    bf    28,L(das_loop_s)	/* CR bit 28 = len & 8.  */
    ld    0,16+32(11)
    addi  4,4,8
    addi  3,3,8
    std   0,16+32(10)
    blt   cr5,L(das_tail)	/* No full 128-byte block left.  */
    b     L(das_loop)
    .align  3
5:
    nop
    bf    28,L(das_loop_s)
    ld    6,32(11)
    addi  4,4,8
    addi  3,3,8
    std   6,32(10)
    blt   cr5,L(das_tail)
    b     L(das_loop)
    .align  3
2:
    mr    11,4
    bf    27,1f
    ld    6,0(4)
    ld    7,8(4)
    addi  4,4,16
    addi  3,3,16
    std   6,0(10)
    std   7,8(10)
    bf    28,L(das_loop_s)
    ld    0,16(11)
    addi  4,11,24
    addi  3,10,24
    std   0,16(10)
    blt   cr5,L(das_tail)
    b     L(das_loop)
    .align  3
1:
    nop
    bf    28,L(das_loop_s)
    ld    6,0(4)
    addi  4,4,8
    addi  3,3,8
    std   6,0(10)
L(das_loop_s):
    nop
    blt   cr5,L(das_tail)	/* Fewer than one full 128-byte block.  */
    .align  4
  /* Main aligned loop: 16 doublewords (128 bytes) per iteration as
     paired ld/ld/std/std to exploit the POWER6 load/store bypass.
     The first block is peeled here; L(das_loop2) iterates via CTR for
     the remaining blocks.  r10/r11 shadow dst/src across the mid-loop
     pointer bump.  */
L(das_loop):
    ld    6,0(4)
    ld    7,8(4)
    mr    10,3
    mr    11,4
    std   6,0(3)
    std   7,8(3)
    addi  12,12,-1	/* One 128-byte block consumed.  */
    nop
    ld    8,16(4)
    ld    0,24(4)
    std   8,16(3)
    std   0,24(3)

    ld    6,0+32(4)
    ld    7,8+32(4)
    std   6,0+32(3)
    std   7,8+32(3)
    ld    8,16+32(4)
    ld    0,24+32(4)
    std   8,16+32(3)
    std   0,24+32(3)

    ld    6,0+64(11)
    ld    7,8+64(11)
    std   6,0+64(10)
    std   7,8+64(10)
    ld    8,16+64(11)
    ld    0,24+64(11)
    std   8,16+64(10)
    std   0,24+64(10)

    ld    6,0+96(11)
    ld    7,8+96(11)
    addi  4,4,128
    addi  3,3,128
    std   6,0+96(10)
    std   7,8+96(10)
    ld    8,16+96(11)
    ld    0,24+96(11)
    std   8,16+96(10)
    std   0,24+96(10)
    ble   cr5,L(das_loop_e)	/* Only one block in total: done.  */

    mtctr   12		/* Remaining 128-byte blocks drive the CTR loop.  */
    .align  4
L(das_loop2):
    ld    6,0(4)
    ld    7,8(4)
    mr    10,3
    mr    11,4
    std   6,0(3)
    std   7,8(3)
    ld    8,16(4)
    ld    0,24(4)
    std   8,16(3)
    std   0,24(3)

    ld    6,0+32(4)
    ld    7,8+32(4)
    std   6,0+32(3)
    std   7,8+32(3)
    ld    8,16+32(4)
    ld    0,24+32(4)
    std   8,16+32(3)
    std   0,24+32(3)

    ld    6,0+64(11)
    ld    7,8+64(11)
    std   6,0+64(10)
    std   7,8+64(10)
    ld    8,16+64(11)
    ld    0,24+64(11)
    std   8,16+64(10)
    std   0,24+64(10)

    ld    6,0+96(11)
    ld    7,8+96(11)
    addi  4,4,128
    addi  3,3,128
    std   6,0+96(10)
    std   7,8+96(10)
    ld    8,16+96(11)
    ld    0,24+96(11)
    std   8,16+96(10)
    std   0,24+96(10)
    bdnz  L(das_loop2)
L(das_loop_e):
/* Check of a 1-7 byte tail, return if none.  */
    bne   cr1,L(das_tail2)
/* Return original dst pointer.  */
    ld 3,-16(1)
    blr
    .align  4
L(das_tail):
    beq   cr1,0f	/* No tail bytes: just return.  */

L(das_tail2):
/*  At this point we have a tail of 0-7 bytes and we know that the
    destination is double word aligned.  Binary decomposition via cr7:
    bit 29 = 4-byte, bit 30 = 2-byte, bit 31 = 1-byte piece.  */
4:  bf    29,2f
    lwz   6,0(4)
    stw   6,0(3)
    bf    30,5f
    lhz   6,4(4)
    sth   6,4(3)
    bf    31,0f
    lbz   6,6(4)
    stb   6,6(3)
    b     0f
5:  bf    31,0f
    lbz   6,4(4)
    stb   6,4(3)
    b     0f

2:  bf    30,1f
    lhz   6,0(4)
    sth   6,0(3)
    bf    31,0f
    lbz   6,2(4)
    stb   6,2(3)
    b     0f

1:  bf    31,0f
    lbz   6,0(4)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    ld 3,-16(1)
    blr
364
/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31
   bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32- 128-byte,
   and 4096-byte boundaries. Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 128-byte
   boundaries.  Since only loads are sensitive to the 32-/128-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use double word load/stores to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small (~20 cycle) delay.  */
    .align  4
.L2:
    mtcrf 0x01,5	/* cr7 = low len bits for the binary moves.  */
    neg   8,4
    clrrdi	11,4,2	/* r11 = src rounded down to a word boundary.  */
    andi. 0,8,3		/* r0 = bytes needed to word-align src.  */
    ble   cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
    cmpldi	cr1,5,16
    mr    10,5		/* r10 = remaining length.  */
    mr    12,4		/* r12 = running src pointer.  */
    cmpldi	cr6,0,2
    beq   L(dus_tail)	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  The aligned word
   containing src is loaded and the leading bytes extracted by shifts,
   so no unaligned load is issued.  */
    lwz   6,0(11)
    subf  10,0,5
    add   12,4,0
    blt   cr6,5f	/* Move 1 byte.  */
    srdi  7,6,16
    bgt	  cr6,3f	/* Move 3 bytes.  */
/* Move 2 bytes: src was halfword aligned within its word.  */
#ifdef __LITTLE_ENDIAN__
    sth   7,0(3)
#else
    sth   6,0(3)
#endif
    b     7f
    .align  4
3:
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,24
    stb   6,0(3)
    sth   7,1(3)
#else
    stb   7,0(3)
    sth   6,1(3)
#endif
    b     7f
    .align  4
5:
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,8
#endif
    stb   6,0(3)
7:
    cmpldi	cr1,10,16
    add   3,3,0
    mtcrf 0x01,10	/* cr7 = low bits of the reduced length.  */
    .align  4
    .align  4
L(dus_tail):
/* At least 6 bytes left and the source is word aligned.  This allows
   some speculative loads up front.  */
/* We need to special case the fall-through because the biggest delays
   are due to address computation not being ready in time for the
   AGEN.  r10 = remaining len, r12 = word-aligned src.  */
    lwz   6,0(12)
    lwz   7,4(12)
    blt   cr1,L(dus_tail8)	/* Fewer than 16 bytes remain.  */
    cmpldi	cr0,10,24
L(dus_tail16): /* Move 16 bytes.  */
    stw   6,0(3)
    stw   7,4(3)
    lwz   6,8(12)
    lwz   7,12(12)
    stw   6,8(3)
    stw   7,12(3)
/* Move 8 bytes more.  */
    bf    28,L(dus_tail16p8)
    cmpldi	cr1,10,28
    lwz   6,16(12)
    lwz   7,20(12)
    stw   6,16(3)
    stw   7,20(3)
/* Move 4 bytes more.  */
    bf    29,L(dus_tail16p4)
    lwz   6,24(12)
    stw   6,24(3)
    addi  12,12,28
    addi  3,3,28
    bgt   cr1,L(dus_tail2)
 /* exactly 28 bytes.  Return original dst pointer and exit.  */
    ld    3,-16(1)
    blr
    .align  4
L(dus_tail16p8):  /* less than 8 bytes left.  */
    beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
    cmpldi	cr1,10,20
    bf    29,L(dus_tail16p2)
/* Move 4 bytes more.  */
    lwz   6,16(12)
    stw   6,16(3)
    addi  12,12,20
    addi  3,3,20
    bgt   cr1,L(dus_tail2)
 /* exactly 20 bytes.  Return original dst pointer and exit.  */
    ld    3,-16(1)
    blr
    .align  4
L(dus_tail16p4):  /* less than 4 bytes left.  */
    addi  12,12,24
    addi  3,3,24
    bgt   cr0,L(dus_tail2)
 /* exactly 24 bytes.  Return original dst pointer and exit.  */
    ld    3,-16(1)
    blr
    .align  4
L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
    addi  12,12,16
    addi  3,3,16
    b     L(dus_tail2)
495
    .align  4
L(dus_tail8):  /* Move 8 bytes.  */
/*  r6, r7 already loaded speculatively.  */
    cmpldi	cr1,10,8
    cmpldi	cr0,10,12
    bf    28,L(dus_tail4)
    .align  2
    stw   6,0(3)
    stw   7,4(3)
/* Move 4 bytes more.  */
    bf    29,L(dus_tail8p4)
    lwz   6,8(12)
    stw   6,8(3)
    addi  12,12,12
    addi  3,3,12
    bgt   cr0,L(dus_tail2)
 /* exactly 12 bytes.  Return original dst pointer and exit.  */
    ld    3,-16(1)
    blr
    .align  4
L(dus_tail8p4):  /* less than 4 bytes left.  */
    addi  12,12,8
    addi  3,3,8
    bgt   cr1,L(dus_tail2)
 /* exactly 8 bytes.  Return original dst pointer and exit.  */
    ld    3,-16(1)
    blr

    .align  4
L(dus_tail4):  /* Move 4 bytes.  */
/*  r6 already loaded speculatively.  If we are here we know there is
    more than 4 bytes left.  So there is no need to test.  */
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
L(dus_tail2):  /* Move 2-3 bytes.  */
    bf    30,L(dus_tail1)
    lhz   6,0(12)
    sth   6,0(3)
    bf    31,L(dus_tailX)
    lbz   7,2(12)
    stb   7,2(3)
    ld 3,-16(1)
    blr
L(dus_tail1):  /* Move 1 byte.  */
    bf    31,L(dus_tailX)
    lbz   6,0(12)
    stb   6,0(3)
L(dus_tailX):
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr
548
/* Special case to copy 0-8 bytes.  */
    .align  4
.LE8:
    mr    12,4
    bne   cr6,L(dus_4)	/* len != 8: binary 4/2/1 decomposition.  */
/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
   cycle delay.  This case should be rare and any attempt to avoid this
   would take most of 20 cycles any way.  */
    ld   6,0(4)
    std   6,0(3)
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr
    .align  4
L(dus_4):
    bf    29,L(dus_tail2)	/* No 4-byte piece: 0-3 bytes left.  */
    lwz   6,0(4)
    stw   6,0(3)
    bf    30,L(dus_5)
    lhz   7,4(4)
    sth   7,4(3)
    bf    31,L(dus_0)
    lbz   8,6(4)
    stb   8,6(3)
    ld 3,-16(1)
    blr
    .align  4
L(dus_5):
    bf    31,L(dus_0)
    lbz   6,4(4)
    stb   6,4(3)
L(dus_0):
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr
584
    .align  4
.L6:
    cfi_offset(31,-8)	/* r31 was saved to the protected zone at entry.  */
    mr    12,4		/* r12 = src.  */
    mr    31,5		/* Preserve remaining len in callee-saved r31.  */
  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.  Dispatch on the
     source misalignment (r10 = src & 7, values 1-7) to a per-shift
     unrolled copy (L(du1_do)..L(du7_do)).  */
    addi    11,9,-1  /* loop DW count is one less than total */
    subf    5,10,12  /* Move source addr to previous full double word.  */
    cmpldi  cr5, 10, 2
    cmpldi  cr0, 10, 4
    mr      4,3		/* r4 = dst for the copy loops below.  */
    srdi    8,11,2   /* calculate the 32 byte loop count */
    ld      6,0(5)   /* pre load 1st full doubleword.  */
    mtcrf   0x01,11	/* cr7 = low bits of (DW count - 1).  */
    cmpldi  cr6,9,4
    mtctr   8
    ld      7,8(5)   /* pre load 2nd full doubleword.  */
    bge     cr0, L(du4_do)
    blt     cr5, L(du1_do)
    beq     cr5, L(du2_do)
    b       L(du3_do)
608
    .align 4
  /* Source misaligned by 1 byte: each destination doubleword is built
     from two adjacent aligned source doublewords, combined by an
     8-bit / 56-bit shift pair and OR (direction swapped for LE).
     r6/r7 carry the live pair of source DWs, r5 = aligned src,
     r4 = dst.  */
L(du1_do):
    bf      30,L(du1_1dw)

    /* there are at least two DWs to copy */
    /* FIXME: can combine last shift and "or" into "rldimi" */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 8
    sldi     8,7, 64-8
#else
    sldi     0,6, 8
    srdi     8,7, 64-8
#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi     0,7, 8
    sldi     8,6, 64-8
#else
    sldi     0,7, 8
    srdi     8,6, 64-8
#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
    addi    4,4,16
    addi    5,5,32
    blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du1_loop)
    /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 8
    sldi     8,7, 64-8
#else
    sldi     0,6, 8
    srdi     8,7, 64-8
#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    addi    4,4,8
    beq     cr6,L(du1_fini)  /* if total DWs = 4, then bypass loop */
    b       L(du1_loop)
    .align 4
L(du1_1dw):
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 8
    sldi     8,7, 64-8
#else
    sldi     0,6, 8
    srdi     8,7, 64-8
#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du1_loop)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    std     0,0(4)
    addi    4,4,8
    .align 4
/* copy 32 bytes at a time */
L(du1_loop):
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 8
    sldi   8,7, 64-8
#else
    sldi   0,6, 8
    srdi   8,7, 64-8
#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 8
    sldi   8,6, 64-8
#else
    sldi   0,7, 8
    srdi   8,6, 64-8
#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 8
    sldi   8,7, 64-8
#else
    sldi   0,6, 8
    srdi   8,7, 64-8
#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 8
    sldi   8,6, 64-8
#else
    sldi   0,7, 8
    srdi   8,6, 64-8
#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
    addi  5,5,32
    addi  4,4,32
    bdnz+ L(du1_loop)
    .align 4
L(du1_fini):
    /* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 8
    sldi   8,7, 64-8
#else
    sldi   0,6, 8
    srdi   8,7, 64-8
#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
731
    .align 4
  /* Source misaligned by 2 bytes: same structure as the 1-byte case
     but with a 16-bit / 48-bit shift pair.  */
L(du2_do):
    bf      30,L(du2_1dw)

    /* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 16
    sldi     8,7, 64-16
#else
    sldi     0,6, 16
    srdi     8,7, 64-16
#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi     0,7, 16
    sldi     8,6, 64-16
#else
    sldi     0,7, 16
    srdi     8,6, 64-16
#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
    addi    4,4,16
    addi    5,5,32
    blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du2_loop)
    /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 16
    sldi     8,7, 64-16
#else
    sldi     0,6, 16
    srdi     8,7, 64-16
#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    addi    4,4,8
    beq     cr6,L(du2_fini)  /* if total DWs = 4, then bypass loop */
    b       L(du2_loop)
    .align 4
L(du2_1dw):
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 16
    sldi     8,7, 64-16
#else
    sldi     0,6, 16
    srdi     8,7, 64-16
#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du2_loop)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    std     0,0(4)
    addi    4,4,8
    .align 4
/* copy 32 bytes at a time */
L(du2_loop):
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 16
    sldi   8,7, 64-16
#else
    sldi   0,6, 16
    srdi   8,7, 64-16
#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 16
    sldi   8,6, 64-16
#else
    sldi   0,7, 16
    srdi   8,6, 64-16
#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 16
    sldi   8,7, 64-16
#else
    sldi   0,6, 16
    srdi   8,7, 64-16
#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 16
    sldi   8,6, 64-16
#else
    sldi   0,7, 16
    srdi   8,6, 64-16
#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
    addi  5,5,32
    addi  4,4,32
    bdnz+ L(du2_loop)
    .align 4
L(du2_fini):
    /* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 16
    sldi   8,7, 64-16
#else
    sldi   0,6, 16
    srdi   8,7, 64-16
#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
853
    .align 4
  /* Source misaligned by 3 bytes: 24-bit / 40-bit shift pair.  */
L(du3_do):
    bf      30,L(du3_1dw)

    /* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 24
    sldi     8,7, 64-24
#else
    sldi     0,6, 24
    srdi     8,7, 64-24
#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi     0,7, 24
    sldi     8,6, 64-24
#else
    sldi     0,7, 24
    srdi     8,6, 64-24
#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
    addi    4,4,16
    addi    5,5,32
    blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du3_loop)
    /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 24
    sldi     8,7, 64-24
#else
    sldi     0,6, 24
    srdi     8,7, 64-24
#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    addi    4,4,8
    beq     cr6,L(du3_fini)  /* if total DWs = 4, then bypass loop */
    b       L(du3_loop)
    .align 4
L(du3_1dw):
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 24
    sldi     8,7, 64-24
#else
    sldi     0,6, 24
    srdi     8,7, 64-24
#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du3_loop)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    std     0,0(4)
    addi    4,4,8
    .align 4
/* copy 32 bytes at a time */
L(du3_loop):
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 24
    sldi   8,7, 64-24
#else
    sldi   0,6, 24
    srdi   8,7, 64-24
#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 24
    sldi   8,6, 64-24
#else
    sldi   0,7, 24
    srdi   8,6, 64-24
#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 24
    sldi   8,7, 64-24
#else
    sldi   0,6, 24
    srdi   8,7, 64-24
#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 24
    sldi   8,6, 64-24
#else
    sldi   0,7, 24
    srdi   8,6, 64-24
#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
    addi  5,5,32
    addi  4,4,32
    bdnz+ L(du3_loop)
    .align 4
L(du3_fini):
    /* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 24
    sldi   8,7, 64-24
#else
    sldi   0,6, 24
    srdi   8,7, 64-24
#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
975
    .align 4
  /* Misalignment >= 4: secondary dispatch (cr0 set to "== 4" and cr5
     compared against 6 by the caller/here) to the 4-7 byte shift
     variants, then the 32-bit / 32-bit shift pair copy for exactly 4.  */
L(du4_do):
    cmpldi  cr5, 10, 6
    beq     cr0, L(du4_dox)	/* src & 7 == 4.  */
    blt     cr5, L(du5_do)	/* src & 7 == 5.  */
    beq     cr5, L(du6_do)	/* src & 7 == 6.  */
    b       L(du7_do)		/* src & 7 == 7.  */
L(du4_dox):
    bf      30,L(du4_1dw)

    /* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 32
    sldi     8,7, 64-32
#else
    sldi     0,6, 32
    srdi     8,7, 64-32
#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi     0,7, 32
    sldi     8,6, 64-32
#else
    sldi     0,7, 32
    srdi     8,6, 64-32
#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
    addi    4,4,16
    addi    5,5,32
    blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du4_loop)
    /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 32
    sldi     8,7, 64-32
#else
    sldi     0,6, 32
    srdi     8,7, 64-32
#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    addi    4,4,8
    beq     cr6,L(du4_fini)  /* if total DWs = 4, then bypass loop */
    b       L(du4_loop)
    .align 4
L(du4_1dw):
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 32
    sldi     8,7, 64-32
#else
    sldi     0,6, 32
    srdi     8,7, 64-32
#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du4_loop)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    std     0,0(4)
    addi    4,4,8
    .align 4
/* copy 32 bytes at a time */
L(du4_loop):
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 32
    sldi   8,7, 64-32
#else
    sldi   0,6, 32
    srdi   8,7, 64-32
#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 32
    sldi   8,6, 64-32
#else
    sldi   0,7, 32
    srdi   8,6, 64-32
#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 32
    sldi   8,7, 64-32
#else
    sldi   0,6, 32
    srdi   8,7, 64-32
#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 32
    sldi   8,6, 64-32
#else
    sldi   0,7, 32
    srdi   8,6, 64-32
#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
    addi  5,5,32
    addi  4,4,32
    bdnz+ L(du4_loop)
    .align 4
L(du4_fini):
    /* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 32
    sldi   8,7, 64-32
#else
    sldi   0,6, 32
    srdi   8,7, 64-32
#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
1103
    .align 4
  /* Source misaligned by 5 bytes: 40-bit / 24-bit shift pair.  */
L(du5_do):
    bf      30,L(du5_1dw)

    /* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 40
    sldi     8,7, 64-40
#else
    sldi     0,6, 40
    srdi     8,7, 64-40
#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi     0,7, 40
    sldi     8,6, 64-40
#else
    sldi     0,7, 40
    srdi     8,6, 64-40
#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
    addi    4,4,16
    addi    5,5,32
    blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du5_loop)
    /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 40
    sldi     8,7, 64-40
#else
    sldi     0,6, 40
    srdi     8,7, 64-40
#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    addi    4,4,8
    beq     cr6,L(du5_fini)  /* if total DWs = 4, then bypass loop */
    b       L(du5_loop)
    .align 4
L(du5_1dw):
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 40
    sldi     8,7, 64-40
#else
    sldi     0,6, 40
    srdi     8,7, 64-40
#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du5_loop)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    std     0,0(4)
    addi    4,4,8
    .align 4
/* copy 32 bytes at a time */
L(du5_loop):
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 40
    sldi   8,7, 64-40
#else
    sldi   0,6, 40
    srdi   8,7, 64-40
#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 40
    sldi   8,6, 64-40
#else
    sldi   0,7, 40
    srdi   8,6, 64-40
#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 40
    sldi   8,7, 64-40
#else
    sldi   0,6, 40
    srdi   8,7, 64-40
#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 40
    sldi   8,6, 64-40
#else
    sldi   0,7, 40
    srdi   8,6, 64-40
#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
    addi  5,5,32
    addi  4,4,32
    bdnz+ L(du5_loop)
    .align 4
L(du5_fini):
    /* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 40
    sldi   8,7, 64-40
#else
    sldi   0,6, 40
    srdi   8,7, 64-40
#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
1225
    .align 4
/* Source is 6 bytes past doubleword alignment: realign with 48-bit
   shifts.  On entry r6/r7 hold the first two aligned source DWs;
   CR bits 30/31 and the cr6 compare encode the low bits of the
   doubleword count (set before this excerpt — presumably from
   len/8; verify against the dispatch code above).  This preamble
   copies 0-3 leading DWs so the unrolled loop can run in multiples
   of four.  */
L(du6_do):
    bf      30,L(du6_1dw)

    /* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 48
    sldi     8,7, 64-48
#else
    sldi     0,6, 48
    srdi     8,7, 64-48
#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi     0,7, 48
    sldi     8,6, 64-48
#else
    sldi     0,7, 48
    srdi     8,6, 64-48
#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
    addi    4,4,16
    addi    5,5,32
    blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du6_loop)
    /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 48
    sldi     8,7, 64-48
#else
    sldi     0,6, 48
    srdi     8,7, 64-48
#endif
    or      0,0,8
    std     0,0(4)
    /* Slide the lookahead window: r7 becomes the older DW and a new
       DW is fetched into r7.  */
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    addi    4,4,8
    beq     cr6,L(du6_fini)  /* if total DWs = 4, then bypass loop */
    b       L(du6_loop)
    .align 4
/* CR bit 30 clear: no leading pair.  Assemble one realigned DW in
   r0; if CR bit 31 is clear fall into the loop with r0 pending,
   otherwise store that single leading DW first.  */
L(du6_1dw):
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 48
    sldi     8,7, 64-48
#else
    sldi     0,6, 48
    srdi     8,7, 64-48
#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du6_loop)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    std     0,0(4)
    addi    4,4,8
    .align 4
/* copy 32 bytes at a time */
/* Unrolled 4x: source misaligned by 6 bytes, so each output DW is
   built from two consecutive aligned DWs with complementary 48-bit
   shifts.  r6/r7 alternate as the older/newer source DW; each load
   is issued before the dependent shift/or that consumes it.  The
   CTR (set before this excerpt) counts 32-byte iterations.  */
L(du6_loop):
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 48
    sldi   8,7, 64-48
#else
    sldi   0,6, 48
    srdi   8,7, 64-48
#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 48
    sldi   8,6, 64-48
#else
    sldi   0,7, 48
    srdi   8,6, 64-48
#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 48
    sldi   8,7, 64-48
#else
    sldi   0,6, 48
    srdi   8,7, 64-48
#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 48
    sldi   8,6, 64-48
#else
    sldi   0,7, 48
    srdi   8,6, 64-48
#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
    addi  5,5,32
    addi  4,4,32
    bdnz+ L(du6_loop)
    .align 4
L(du6_fini):
    /* calculate and store the final DW */
    /* r6/r7 still hold the last two source DWs; emit the one
       remaining realigned doubleword and join the common tail.  */
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 48
    sldi   8,7, 64-48
#else
    sldi   0,6, 48
    srdi   8,7, 64-48
#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
1347
    .align 4
/* Source is 7 bytes past doubleword alignment: realign with 56-bit
   shifts.  Same structure as the du6 case — r6/r7 hold the first two
   aligned source DWs, CR bits 30/31 and cr6 (set before this
   excerpt) select how many leading DWs (0-3) to copy before the
   unrolled loop.  */
L(du7_do):
    bf      30,L(du7_1dw)

    /* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 56
    sldi     8,7, 64-56
#else
    sldi     0,6, 56
    srdi     8,7, 64-56
#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi     0,7, 56
    sldi     8,6, 64-56
#else
    sldi     0,7, 56
    srdi     8,6, 64-56
#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
    addi    4,4,16
    addi    5,5,32
    blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du7_loop)
    /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 56
    sldi     8,7, 64-56
#else
    sldi     0,6, 56
    srdi     8,7, 64-56
#endif
    or      0,0,8
    std     0,0(4)
    /* Slide the lookahead window before entering the loop.  */
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    addi    4,4,8
    beq     cr6,L(du7_fini)  /* if total DWs = 4, then bypass loop */
    b       L(du7_loop)
    .align 4
/* CR bit 30 clear: no leading pair.  Build one realigned DW in r0;
   if CR bit 31 is clear enter the loop with r0 pending, otherwise
   store the single leading DW first.  */
L(du7_1dw):
#ifdef __LITTLE_ENDIAN__
    srdi     0,6, 56
    sldi     8,7, 64-56
#else
    sldi     0,6, 56
    srdi     8,7, 64-56
#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du7_loop)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    std     0,0(4)
    addi    4,4,8
    .align 4
/* copy 32 bytes at a time */
/* Unrolled 4x: source misaligned by 7 bytes, so each output DW fuses
   56 bits of one aligned source DW with 8 bits of the next via
   complementary shifts.  r6/r7 alternate as older/newer DW; the CTR
   (set before this excerpt) counts 32-byte iterations.  */
L(du7_loop):
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 56
    sldi   8,7, 64-56
#else
    sldi   0,6, 56
    srdi   8,7, 64-56
#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 56
    sldi   8,6, 64-56
#else
    sldi   0,7, 56
    srdi   8,6, 64-56
#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 56
    sldi   8,7, 64-56
#else
    sldi   0,6, 56
    srdi   8,7, 64-56
#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
#ifdef __LITTLE_ENDIAN__
    srdi   0,7, 56
    sldi   8,6, 64-56
#else
    sldi   0,7, 56
    srdi   8,6, 64-56
#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
    addi  5,5,32
    addi  4,4,32
    bdnz+ L(du7_loop)
    .align 4
L(du7_fini):
    /* calculate and store the final DW */
    /* r6/r7 still hold the last two source DWs; emit the remaining
       realigned doubleword and join the common tail.  */
#ifdef __LITTLE_ENDIAN__
    srdi   0,6, 56
    sldi   8,7, 64-56
#else
    sldi   0,6, 56
    srdi   8,7, 64-56
#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
1469
    .align 4
/* Common exit for all the unaligned-copy paths: copy the 0-7 byte
   tail and restore the return value.  Registers here: r31 holds the
   length (its low bits select the tail pieces), r3 = original dst
   base, r12 = original src base — presumably saved in the prologue
   above this excerpt; cr1 was likewise set earlier (tail == 0
   test).  */
L(du_done):
    rldicr 0,31,0,60	/* r0 = r31 & ~7: bytes handled by the DW code */
    mtcrf 0x01,31	/* low 4 bits of r31 -> cr7 for the bf 29/30/31 tests */
    beq   cr1,0f	/* If the tail is 0 bytes we are done!  */

    add   3,3,0		/* dst += bytes already copied */
    add   12,12,0	/* src += bytes already copied */
/*  At this point we have a tail of 0-7 bytes and we know that the
    destination is double word aligned.  */
4:  bf    29,2f		/* bit 29 set (len & 4): copy a word */
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  bf    30,1f		/* bit 30 set (len & 2): copy a halfword */
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
1:  bf    31,0f		/* bit 31 set (len & 1): copy the last byte */
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    ld 31,-8(1)		/* restore callee-saved r31 from the red zone */
    ld 3,-16(1)		/* reload the saved original dst as the return value */
    blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)
1500