/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#define PREFETCH_AHEAD 6	/* number of SRC cache lines prefetched ahead  */
#define ZERO_AHEAD 4		/* number of DST cache lines zeroed ahead  */

/* memcpy routine optimized for CELL-BE-PPC	v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The below code is loop unrolled for the CELL cache line of 128 bytes
 *
 * Register usage in this file:
 *   r3        DST as passed in.  No instruction below writes it, so it
 *             still holds the original DST at every return.
 *   r4        SRC, advanced as the copy proceeds
 *   r5        size, then the number of bytes still to copy
 *   r6        working DST pointer
 *   r7        SRC - DST distance during the byte-granular head and tail
 *             copies; a scratch count in between
 *   r8        number of bytes needed to reach a 16-byte DST boundary
 *   r10       cache lines handled by .LprefetchSRC and later by .Lloop2
 *             (at most PREFETCH_AHEAD)
 *   r11       cache lines handled by .Lloop, then the dcbz offset in it
 *   r12       dcbt offset
 *   r0        integer data for the 1/2/4-byte moves
 *   fp9-fp12  data for the 8-byte moves
 *   cr0, cr1, cr5, cr6, cr7 and ctr are used as well.
 *
 * Phases:
 *   1. size < 16: go straight to .Lshortcopy.
 *   2. Copy 0-15 bytes so that DST is 16-byte aligned.
 *   3. If at least 128 bytes remain: copy 16-byte chunks until DST is
 *      cache line aligned, then whole cache lines (.Lloop with dcbt and
 *      dcbz, .Lloop2 for the last lines without them).
 *   4. Copy the remaining 16-byte chunks, then the last 0-15 bytes.
 * Only DST is ever aligned; nothing below looks at the alignment of SRC.
 */

.align  7

EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmplwi	cr1,r5,16	/* is size < 16 ?  */
	mr	r6,r3		/* work on a copy of DST, r3 stays as passed  */
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	/* size >= 16.  r8 = (-DST) & 15, the distance from DST up to the
	   next 16-byte boundary.  */
	neg	r8,r3
	clrlwi  r8,r8,32-4	/* keep LS 4 bits = # bytes to 16-byte DST bdry  */
	sub     r7,r4,r3	/* r7 = SRC - DST, so SRC == r7 + r6 below  */
	cmplwi	cr0,r8,0
	beq+	.Ldst_aligned

.Ldst_unaligned:
	/* cr7 receives the low 4 bits of r8: bit cr7*4+3 is the 1s bit,
	   +2 the 2s, +1 the 4s and +0 the 8s.  Moving 1, 2, 4, 8 bytes in
	   that order keeps every store naturally aligned.  */
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	subf	r5,r8,r5	/* r5 -= r8; r8 <= 15 < 16 <= size  */

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
	lfdx	fp9,r7,r6	/* copy 8 byte  */
	stfd	fp9,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* r4 = SRC matching the advanced DST  */

.Ldst_aligned:
	/* DST (r6) is 16-byte aligned here.
	   NOTE(review): the compare below is signed, so a size with bit 31
	   set also takes the .Llessthancacheline path, which still copies
	   all of it, 16 bytes per iteration.  */
	cmpwi	cr5,r5,128-1

	neg	r7,r6		/* taken before r6 is biased below  */
	addi	r6,r6,-8	/* prepare for stfdu  */
	addi	r4,r4,-8	/* prepare for lfdu  */

	clrlwi  r7,r7,32-7	/* r7 = # bytes to DST cacheline boundary  */
	ble+	cr5,.Llessthancacheline

	cmplwi	cr6,r7,0	/* cr6.eq: DST already cacheline aligned  */
	subf	r5,r7,r5	/* r5 = bytes left once DST is line aligned  */
	srwi	r7,r7,4		/* r7 = # 16-byte chunks up to that boundary  */
	srwi	r10,r5,7	/* number of cache lines to copy  */

	cmplwi	r10,0
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	.Lnocacheprefetch

	cmplwi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance; +8 undoes the r4 bias  */
	ble	.Llessthanmaxprefetch

	/* More than PREFETCH_AHEAD lines: .Lloop copies all but the last
	   PREFETCH_AHEAD of them, .Lloop2 copies those.  */
	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
	mtctr	r10

.LprefetchSRC:
	/* Touch the next r10 SRC lines.  r12 ends up as 128+8 + 128*r10 and
	   is reused unchanged as the dcbt distance in .Lloop.  */
	dcbt    r12,r4
	addi    r12,r12,128
	bdnz    .LprefetchSRC

.Lnocacheprefetch:
	mtctr	r7
	cmplwi	cr1,r5,128	/* cr1.lt: no whole cache line to copy  */
	clrlwi  r5,r5,32-7	/* r5 = bytes left after the whole lines  */
	beq	cr6,.Lcachelinealigned	/* ctr would be 0, skip the loop  */

.Laligntocacheline:
	/* 16 bytes per iteration until DST is cache line aligned.  */
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:		/* copy while cache lines  */

	blt-	cr1,.Llessthancacheline	/* size <128  */

.Louterloop:
	cmpwi   r11,0
	mtctr	r11
	beq-	.Lendloop	/* no line to copy with prefetch  */

	/* r11 becomes the dcbz offset.  r6 is DST - 8, so r11 + r6 is the
	   DST line ZERO_AHEAD lines past the one being written.  The last
	   PREFETCH_AHEAD lines are left to .Lloop2 and ZERO_AHEAD is
	   smaller than PREFETCH_AHEAD, so every line zeroed here lies
	   inside the destination and is completely overwritten later.  */
	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist  */

.align	4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
.Lloop:				/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	lfd	fp9, 0x08(r4)
	dcbz	r11,r6
	lfd	fp10, 0x10(r4)	/* 4 register stride copy is optimal  */
	lfd	fp11, 0x18(r4)	/* to hide 1st level cache latency.  */
	lfd	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9, 0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfd	fp12, 0x40(r4)
	stfd	fp9, 0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfd	fp12, 0x40(r6)
	lfd	fp9, 0x48(r4)
	lfd	fp10, 0x50(r4)
	lfd	fp11, 0x58(r4)
	lfd	fp12, 0x60(r4)
	stfd	fp9, 0x48(r6)
	stfd	fp10, 0x50(r6)
	stfd	fp11, 0x58(r6)
	stfd	fp12, 0x60(r6)
	lfd	fp9, 0x68(r4)
	lfd	fp10, 0x70(r4)
	lfd	fp11, 0x78(r4)
	lfdu	fp12, 0x80(r4)	/* last load of the line advances r4 by 128  */
	stfd	fp9, 0x68(r6)
	stfd	fp10, 0x70(r6)
	stfd	fp11, 0x78(r6)
	stfdu	fp12, 0x80(r6)	/* last store of the line advances r6 by 128  */

	bdnz	.Lloop

.Lendloop:
	/* The remaining r10 lines were prefetched by .LprefetchSRC; copy
	   them without dcbt or dcbz.  */
	cmpwi	r10,0
	slwi	r10,r10,2	/* adjust from 128 to 32 byte stride  */
	beq-	.Lendloop2
	mtctr	r10

.Lloop2:			/* Copy aligned body  */
	lfd	fp9, 0x08(r4)
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfdu	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfdu	fp12, 0x20(r6)

	bdnz	.Lloop2
.Lendloop2:

.Llessthancacheline:		/* less than cache to do ?  */
	cmplwi	cr0,r5,16
	srwi	r7,r5,4		/* r7 = # 16-byte chunks left  */
	blt-	.Ldo_lt16
	mtctr	r7

.Lcopy_remaining:
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	.Lcopy_remaining

.Ldo_lt16:			/* less than 16 ?  */
	/* r5 is not reduced by the loop above.  Only r5 == 0 returns here;
	   a nonzero multiple of 16 falls through and .Lshortcopy moves
	   nothing, because it looks at the low 4 bits of r5 only.  */
	cmplwi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* no rest to copy  */
	addi	r4,r4,8		/* undo the -8 bias on SRC  */
	addi	r6,r6,8		/* undo the -8 bias on DST  */

.Lshortcopy:			/* SIMPLE COPY to handle size =< 15 bytes  */
	/* The low 4 bits of r5 select an 8, 4, 2 and 1 byte move, in that
	   order.  r4 is SRC and r6 is DST, unbiased.  */
	mtcrf	0x01,r5
	sub	r7,r4,r6	/* r7 = SRC - DST, so SRC == r7 + r6  */
	bf-	cr7*4+0,8f
	lfdx	fp9,r7,r6	/* copy 8 byte  */
	stfd	fp9,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
1:	blr

END (memcpy)
libc_hidden_builtin_def (memcpy)