1/* Optimized memcpy implementation for PowerPC A2.
2   Copyright (C) 2010-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
#include <sysdep.h>
#include <rtld-global-offsets.h>

/* Allow building this implementation under an alternate symbol name;
   default to the public `memcpy'.  */
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 4        /* number of cache lines to prefetch ahead of SRC  */
#define ZERO_AHEAD 2            /* number of cache lines to dcbz ahead of DST  */

	/* TOC entry used by __GLRO below to read the runtime cache line
	   size out of _rtld_global_ro.  */
	.section        ".toc","aw"
__GLRO_DEF(dl_cache_line_size)


	.section        ".text"
	.align 2


	.machine  a2
/* void * [r3] memcpy (void *dst [r3], const void *src [r4], size_t len [r5]);
   Returns dst (r3 is never modified).  Optimized for PowerPC A2:
   copies whole cache lines, prefetching SRC lines with dcbt and
   establishing DST lines with dcbz, using the cache line size the
   dynamic linker publishes in _rtld_global_ro.  Separate paths handle
   64-byte and 128-byte cache lines; a plain byte copy is used when the
   line size is not known.  */
ENTRY (MEMCPY, 5)
	CALL_MCOUNT 3

	dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
	cmpldi  cr1,r5,16       /* is size < 16 ?  */
	mr      r6,r3           /* Copy dest reg to r6; r3 is preserved as the return value  */
	blt+    cr1,L(shortcopy)


	/* Big copy (16 bytes or more)

	   Figure out how far to the nearest quadword boundary, or if we are
	   on one already.  Also get the cache line size.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr
	*/

	neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry  */
	/* Get the cache line size.  */
	__GLRO (r9, dl_cache_line_size,
		RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)
	clrldi  r8,r8,64-4      /* align to 16byte boundary  */
	sub     r7,r4,r3        /* compute offset to src from dest */
	cmpldi  cr0,r8,0        /* Were we aligned on a 16 byte bdy? */
	addi    r10,r9,-1       /* Cache line mask */
	beq+    L(dst_aligned)



	/* Destination is not aligned on quadword boundary.  Get us to one.

	   Copy 1, 2, 4 and/or 8 bytes as selected by the low 4 bits of r8,
	   which were transferred into cr7 so each size can be tested with a
	   single bf.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr
	   r7 - offset to src from dest
	   r8 - number of bytes to quadword boundary
	*/

	mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
	subf    r5,r8,r5        /* adjust remaining len */

	bf      cr7*4+3,1f
	lbzx    r0,r7,r6        /* copy 1 byte addr */
	stb     r0,0(r6)
	addi    r6,r6,1
1:
	bf      cr7*4+2,2f
	lhzx    r0,r7,r6        /* copy 2 byte addr */
	sth     r0,0(r6)
	addi    r6,r6,2
2:
	bf      cr7*4+1,4f
	lwzx    r0,r7,r6        /* copy 4 byte addr */
	stw     r0,0(r6)
	addi    r6,r6,4
4:
	bf      cr7*4+0,8f
	ldx     r0,r7,r6        /* copy 8 byte addr */
	std     r0,0(r6)
	addi    r6,r6,8
8:
	add     r4,r7,r6        /* update src addr */



	/* Dest is quadword aligned now.

	   Lots of decisions to make.  If we are copying less than a cache
	   line we won't be here long.  If we are not on a cache line
	   boundary we need to get there.  And then we need to figure out
	   how many cache lines ahead to pre-touch.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr
	*/


	.align 4
L(dst_aligned):
	cmpdi	cr0,r9,0	/* Cache line size set? */
	bne+	cr0,L(cachelineset)

/* Cache line size not set: generic byte copy without much optimization.
   The line-copy code below depends on knowing the dcbz/dcbt line size,
   so none of it can be used here.  */
	clrldi.	r0,r5,63	/* If length is odd copy one byte */
	beq	L(cachelinenotset_align)
	lbz	r7,0(r4)	/* Read one byte from source */
	addi	r5,r5,-1	/* Update length */
	addi	r4,r4,1		/* Update source pointer address */
	stb	r7,0(r6)	/* Store one byte at dest */
	addi	r6,r6,1		/* Update dest pointer address */
L(cachelinenotset_align):
	cmpdi	cr7,r5,0	/* If length is 0 return */
	beqlr	cr7
	ori	r2,r2,0		/* Force a new dispatch group */
	/* Length is now even, so copy two bytes per iteration.  */
L(cachelinenotset_loop):
	addic.	r5,r5,-2	/* Update length */
	lbz	r7,0(r4)	/* Load 2 bytes from source */
	lbz	r8,1(r4)
	addi	r4,r4,2		/* Update source pointer address */
	stb	r7,0(r6)	/* Store 2 bytes on dest */
	stb	r8,1(r6)
	addi	r6,r6,2		/* Update dest pointer address */
	bne	L(cachelinenotset_loop)
	blr


L(cachelineset):
	cmpd	cr5,r5,r10       /* Less than a cacheline to go? */

	neg     r7,r6           /* How far to next cacheline bdy? */

	/* Bias the pointers down by 8 so the bulk loops can use
	   ldu/stdu with pre-increment addressing.  */
	addi    r6,r6,-8        /* prepare for stdu  */
	cmpdi   cr0,r9,128
	addi    r4,r4,-8        /* prepare for ldu  */


	ble+    cr5,L(lessthancacheline)

	beq-    cr0,L(big_lines) /* 128 byte line code */



	/* More than a cacheline left to go, and using 64 byte cachelines */

	clrldi  r7,r7,64-6      /* How far to next cacheline bdy? */

	cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already? */

	/* Reduce total len by what it takes to get to the next cache line */
	subf    r5,r7,r5
	srdi    r7,r7,4         /* How many qws to get to the line bdy? */

	/* How many full cache lines to copy after getting to a line bdy? */
	srdi    r10,r5,6

	cmpldi  r10,0           /* If no full cache lines to copy ... */
	li      r11,0           /* number cachelines to copy with prefetch  */
	beq     L(nocacheprefetch)


	/* We are here because we have at least one full cache line to copy,
	   and therefore some pre-touching to do. */

	cmpldi  r10,PREFETCH_AHEAD
	li      r12,64+8        /* prefetch distance  */
	ble     L(lessthanmaxprefetch)

	/* We can only do so much pre-fetching.  R11 will have the count of
	   lines left to prefetch after the initial batch of prefetches
	   are executed. */

	subi    r11,r10,PREFETCH_AHEAD
	li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
	mtctr   r10

	/* At this point r10/ctr hold the number of lines to prefetch in this
	   initial batch, and r11 holds any remainder. */

L(prefetchSRC):
	dcbt    r12,r4
	addi    r12,r12,64
	bdnz    L(prefetchSRC)


	/* Prefetching is done, or was not needed.

	   cr6 - are we on a cacheline boundary already?
	   r7  - number of quadwords to the next cacheline boundary
	*/

L(nocacheprefetch):
	mtctr   r7

	cmpldi  cr1,r5,64   /* Less than a cache line to copy?
			       (cr1 is consumed at L(cachelinealigned))  */

	/* How many bytes are left after we copy whatever full
	   cache lines we can get? */
	clrldi  r5,r5,64-6

	beq     cr6,L(cachelinealigned)


	/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline):
	ld      r9,0x08(r4)
	ld      r7,0x10(r4)
	addi    r4,r4,0x10
	std     r9,0x08(r6)
	stdu    r7,0x10(r6)
	bdnz    L(aligntocacheline)


	.align 4
L(cachelinealigned):            /* copy while cache lines  */

	blt-    cr1,L(lessthancacheline) /* size <64  */

L(outerloop):
	cmpdi   r11,0
	mtctr   r11
	beq-    L(endloop)

	li      r11,64*ZERO_AHEAD +8    /* DCBZ dist  */

	.align  4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop):                        /* Copy aligned body  */
	dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
	ld      r9, 0x08(r4)
	dcbz    r11,r6          /* Establish the DST line ahead without fetching it  */
	ld      r7, 0x10(r4)
	ld      r8, 0x18(r4)
	ld      r0, 0x20(r4)
	std     r9, 0x08(r6)
	std     r7, 0x10(r6)
	std     r8, 0x18(r6)
	std     r0, 0x20(r6)
	ld      r9, 0x28(r4)
	ld      r7, 0x30(r4)
	ld      r8, 0x38(r4)
	ld      r0, 0x40(r4)
	addi    r4, r4,0x40
	std     r9, 0x28(r6)
	std     r7, 0x30(r6)
	std     r8, 0x38(r6)
	stdu    r0, 0x40(r6)

	bdnz    L(loop)


	/* Copy the remaining full lines (those past the prefetch window)
	   without further dcbt/dcbz.  */
L(endloop):
	cmpdi   r10,0
	beq-    L(endloop2)
	mtctr   r10

L(loop2):                       /* Copy aligned body  */
	ld      r9, 0x08(r4)
	ld      r7, 0x10(r4)
	ld      r8, 0x18(r4)
	ld      r0, 0x20(r4)
	std     r9, 0x08(r6)
	std     r7, 0x10(r6)
	std     r8, 0x18(r6)
	std     r0, 0x20(r6)
	ld      r9, 0x28(r4)
	ld      r7, 0x30(r4)
	ld      r8, 0x38(r4)
	ld      r0, 0x40(r4)
	addi    r4, r4,0x40
	std     r9, 0x28(r6)
	std     r7, 0x30(r6)
	std     r8, 0x38(r6)
	stdu    r0, 0x40(r6)

	bdnz    L(loop2)
L(endloop2):


	/* Tail: fewer than a cache line of bytes remain.  Entered with
	   r4/r6 still carrying the -8 ldu/stdu bias.  Also the target of
	   the 128-byte path's final branch.  */
	.align 4
L(lessthancacheline):           /* Was there less than cache to do ?  */
	cmpldi  cr0,r5,16
	srdi    r7,r5,4         /* divide size by 16  */
	blt-    L(do_lt16)
	mtctr   r7

L(copy_remaining):
	ld      r8,0x08(r4)
	ld      r7,0x10(r4)
	addi    r4,r4,0x10
	std     r8,0x08(r6)
	stdu    r7,0x10(r6)
	bdnz    L(copy_remaining)

L(do_lt16):                     /* less than 16 ?  */
	cmpldi  cr0,r5,0        /* copy remaining bytes (0-15)  */
	beqlr+                  /* no rest to copy  */
	/* Undo the ldu/stdu bias so r4/r6 point at the next byte.  */
	addi    r4,r4,8
	addi    r6,r6,8

	/* Copy 8/4/2/1 bytes as selected by the low 4 bits of the length,
	   moved into cr7.  r7 = src - dst so a single indexed load pairs
	   with each displacement store.  Also the entry point for initial
	   sizes < 16.  */
L(shortcopy):                   /* SIMPLE COPY to handle size =< 15 bytes  */
	mtcrf   0x01,r5
	sub     r7,r4,r6
	bf-     cr7*4+0,8f
	ldx     r0,r7,r6        /* copy 8 byte  */
	std     r0,0(r6)
	addi    r6,r6,8
8:
	bf      cr7*4+1,4f
	lwzx    r0,r7,r6        /* copy 4 byte  */
	stw     r0,0(r6)
	addi    r6,r6,4
4:
	bf      cr7*4+2,2f
	lhzx    r0,r7,r6        /* copy 2 byte  */
	sth     r0,0(r6)
	addi    r6,r6,2
2:
	bf      cr7*4+3,1f
	lbzx    r0,r7,r6        /* copy 1 byte  */
	stb     r0,0(r6)
1:
	blr




	/* Similar to above, but for use with 128 byte lines. */


L(big_lines):

	clrldi  r7,r7,64-7      /* How far to next cacheline bdy? */

	cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already? */

	/* Reduce total len by what it takes to get to the next cache line */
	subf    r5,r7,r5
	srdi    r7,r7,4         /* How many qws to get to the line bdy? */

	/* How many full cache lines to copy after getting to a line bdy? */
	srdi    r10,r5,7

	cmpldi  r10,0           /* If no full cache lines to copy ... */
	li      r11,0           /* number cachelines to copy with prefetch  */
	beq     L(nocacheprefetch_128)


	/* We are here because we have at least one full cache line to copy,
	   and therefore some pre-touching to do. */

	cmpldi  r10,PREFETCH_AHEAD
	li      r12,128+8       /* prefetch distance  */
	ble     L(lessthanmaxprefetch_128)

	/* We can only do so much pre-fetching.  R11 will have the count of
	   lines left to prefetch after the initial batch of prefetches
	   are executed. */

	subi    r11,r10,PREFETCH_AHEAD
	li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
	mtctr   r10

	/* At this point r10/ctr hold the number of lines to prefetch in this
	   initial batch, and r11 holds any remainder. */

L(prefetchSRC_128):
	dcbt    r12,r4
	addi    r12,r12,128
	bdnz    L(prefetchSRC_128)


	/* Prefetching is done, or was not needed.

	   cr6 - are we on a cacheline boundary already?
	   r7  - number of quadwords to the next cacheline boundary
	*/

L(nocacheprefetch_128):
	mtctr   r7

	cmpldi  cr1,r5,128  /* Less than a cache line to copy?
			       (cr1 is consumed at L(cachelinealigned_128))  */

	/* How many bytes are left after we copy whatever full
	   cache lines we can get? */
	clrldi  r5,r5,64-7

	beq     cr6,L(cachelinealigned_128)


	/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline_128):
	ld      r9,0x08(r4)
	ld      r7,0x10(r4)
	addi    r4,r4,0x10
	std     r9,0x08(r6)
	stdu    r7,0x10(r6)
	bdnz    L(aligntocacheline_128)


L(cachelinealigned_128):        /* copy while cache lines  */

	blt-    cr1,L(lessthancacheline) /* size <128  */

L(outerloop_128):
	cmpdi   r11,0
	mtctr   r11
	beq-    L(endloop_128)

	li      r11,128*ZERO_AHEAD +8    /* DCBZ dist  */

	.align  4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop_128):                    /* Copy aligned body  */
	dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
	ld      r9, 0x08(r4)
	dcbz    r11,r6          /* Establish the DST line ahead without fetching it  */
	ld      r7, 0x10(r4)
	ld      r8, 0x18(r4)
	ld      r0, 0x20(r4)
	std     r9, 0x08(r6)
	std     r7, 0x10(r6)
	std     r8, 0x18(r6)
	std     r0, 0x20(r6)
	ld      r9, 0x28(r4)
	ld      r7, 0x30(r4)
	ld      r8, 0x38(r4)
	ld      r0, 0x40(r4)
	std     r9, 0x28(r6)
	std     r7, 0x30(r6)
	std     r8, 0x38(r6)
	std     r0, 0x40(r6)
	ld      r9, 0x48(r4)
	ld      r7, 0x50(r4)
	ld      r8, 0x58(r4)
	ld      r0, 0x60(r4)
	std     r9, 0x48(r6)
	std     r7, 0x50(r6)
	std     r8, 0x58(r6)
	std     r0, 0x60(r6)
	ld      r9, 0x68(r4)
	ld      r7, 0x70(r4)
	ld      r8, 0x78(r4)
	ld      r0, 0x80(r4)
	addi    r4, r4,0x80
	std     r9, 0x68(r6)
	std     r7, 0x70(r6)
	std     r8, 0x78(r6)
	stdu    r0, 0x80(r6)

	bdnz    L(loop_128)


	/* Copy the remaining full lines (those past the prefetch window)
	   without further dcbt/dcbz.  */
L(endloop_128):
	cmpdi   r10,0
	beq-    L(endloop2_128)
	mtctr   r10

L(loop2_128):                       /* Copy aligned body  */
	ld      r9, 0x08(r4)
	ld      r7, 0x10(r4)
	ld      r8, 0x18(r4)
	ld      r0, 0x20(r4)
	std     r9, 0x08(r6)
	std     r7, 0x10(r6)
	std     r8, 0x18(r6)
	std     r0, 0x20(r6)
	ld      r9, 0x28(r4)
	ld      r7, 0x30(r4)
	ld      r8, 0x38(r4)
	ld      r0, 0x40(r4)
	std     r9, 0x28(r6)
	std     r7, 0x30(r6)
	std     r8, 0x38(r6)
	std     r0, 0x40(r6)
	ld      r9, 0x48(r4)
	ld      r7, 0x50(r4)
	ld      r8, 0x58(r4)
	ld      r0, 0x60(r4)
	std     r9, 0x48(r6)
	std     r7, 0x50(r6)
	std     r8, 0x58(r6)
	std     r0, 0x60(r6)
	ld      r9, 0x68(r4)
	ld      r7, 0x70(r4)
	ld      r8, 0x78(r4)
	ld      r0, 0x80(r4)
	addi    r4, r4,0x80
	std     r9, 0x68(r6)
	std     r7, 0x70(r6)
	std     r8, 0x78(r6)
	stdu    r0, 0x80(r6)

	bdnz    L(loop2_128)
L(endloop2_128):

	/* Finish the sub-line remainder with the common tail code.  */
	b       L(lessthancacheline)


END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)
531