/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <rtld-global-offsets.h>

#define PREFETCH_AHEAD 4        /* number of cache lines to prefetch ahead at SRC  */
#define ZERO_AHEAD 2            /* number of cache lines to dcbz ahead at DST  */

	.machine  a2

/* void *memcpy (void *dst [r3], const void *src [r4], size_t len [r5])

   Returns dst (r3 is never modified; r6 is used as the working dest
   pointer).  8-byte moves go through the FP registers (lfd/stfd): the
   A2 has 64-bit FPRs even in 32-bit mode, so this doubles the width
   of each load/store versus GPRs.  The bulk-copy strategy is selected
   at runtime from the cache line size (64 or 128 bytes) published in
   rtld_global_ro from the aux vector; if the size is unknown (0) a
   plain byte-pair loop is used.  */
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
	cmplwi  cr1,r5,16       /* is size < 16 ?  */
	mr      r6,r3           /* Copy dest reg to r6; r3 is preserved
				   untouched as the return value.  */
	blt+    cr1,L(shortcopy)


	/* Big copy (16 bytes or more)

	   Figure out how far to the nearest quadword boundary, or if we are
	   on one already.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr
	*/

	neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry  */
	clrlwi  r8,r8,32-4      /* align to 16byte boundary  */
	sub     r7,r4,r3        /* compute offset to src from dest; loads
				   below use r7+r6 so only r6 must be
				   bumped while aligning.  */
	cmplwi  cr0,r8,0        /* Were we aligned on a 16 byte bdy? */
	beq+    L(dst_aligned)



	/* Destination is not aligned on quadword boundary.  Get us to one.

	   Copy 1, 2, 4 and/or 8 bytes as selected by the low 4 bits of r8
	   (moved into cr7) so that at most one pass through each size is
	   needed.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr
	   r7 - offset to src from dest
	   r8 - number of bytes to quadword boundary
	*/

	mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
	subf    r5,r8,r5        /* adjust remaining len */

	bf      cr7*4+3,1f
	lbzx    r0,r7,r6        /* copy 1 byte addr */
	stb     r0,0(r6)
	addi    r6,r6,1
1:
	bf      cr7*4+2,2f
	lhzx    r0,r7,r6        /* copy 2 byte addr */
	sth     r0,0(r6)
	addi    r6,r6,2
2:
	bf      cr7*4+1,4f
	lwzx    r0,r7,r6        /* copy 4 byte addr */
	stw     r0,0(r6)
	addi    r6,r6,4
4:
	bf      cr7*4+0,8f
	lfdx    r0,r7,r6        /* copy 8 byte addr */
	stfd    r0,0(r6)
	addi    r6,r6,8
8:
	add     r4,r7,r6        /* update src addr */



	/* Dest is quadword aligned now.

	   Lots of decisions to make.  If we are copying less than a cache
	   line we won't be here long.  If we are not on a cache line
	   boundary we need to get there.  And then we need to figure out
	   how many cache lines ahead to pre-touch.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr
	*/


	.align  4
L(dst_aligned):


#ifdef PIC
	mflr    r0
/* Establishes GOT addressability so we can load the cache line size
   from rtld_global_ro.  This value was set from the aux vector during
   startup.  */
	SETUP_GOT_ACCESS(r9,got_label)
	addis	r9,r9,_GLOBAL_OFFSET_TABLE_-got_label@ha
	addi	r9,r9,_GLOBAL_OFFSET_TABLE_-got_label@l
	mtlr	r0
#endif
	__GLRO(r9, r9, _dl_cache_line_size,
	       RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)

	cmplwi  cr5, r9, 0	/* r9 = runtime cache line size, or 0
				   if the kernel did not report one.  */
	bne+    cr5,L(cachelineset)

/* Cache line size not set: generic byte copy without much optimization */
	andi.	r0,r5,1		/* If length is odd copy one byte.  */
	beq	L(cachelinenotset_align)
	lbz	r7,0(r4)	/* Read one byte from source.  */
	addi	r5,r5,-1	/* Update length.  */
	addi	r4,r4,1		/* Update source pointer address.  */
	stb	r7,0(r6)	/* Store one byte on dest.  */
	addi	r6,r6,1		/* Update dest pointer address.  */
L(cachelinenotset_align):
	cmpwi   cr7,r5,0	/* If length is 0 return.  */
	beqlr	cr7
	ori	r2,r2,0		/* Force a new dispatch group.  */
L(cachelinenotset_loop):
	addic.	r5,r5,-2	/* Update length.  */
	lbz	r7,0(r4)	/* Load 2 bytes from source.  */
	lbz	r8,1(r4)
	addi	r4,r4,2		/* Update source pointer address.  */
	stb	r7,0(r6)	/* Store 2 bytes on dest.  */
	stb	r8,1(r6)
	addi	r6,r6,2		/* Update dest pointer address.  */
	bne	L(cachelinenotset_loop)
	blr


L(cachelineset):

	addi   r10,r9,-1

	cmpw   cr5,r5,r10       /* Less than a cacheline to go? */

	neg     r7,r6           /* How far to next cacheline bdy? */

	addi    r6,r6,-8        /* prepare for stdu  */
	cmpwi   cr0,r9,128
	addi    r4,r4,-8        /* prepare for ldu  */


	ble+    cr5,L(lessthancacheline)

	beq-    cr0,L(big_lines) /* 128 byte line code */




	/* More than a cacheline left to go, and using 64 byte cachelines */

	clrlwi  r7,r7,32-6      /* How far to next cacheline bdy? */

	cmplwi  cr6,r7,0        /* Are we on a cacheline bdy already? */

	/* Reduce total len by what it takes to get to the next cache line */
	subf    r5,r7,r5
	srwi    r7,r7,4         /* How many qws to get to the line bdy? */

	/* How many full cache lines to copy after getting to a line bdy? */
	srwi    r10,r5,6

	cmplwi  r10,0           /* If no full cache lines to copy ... */
	li      r11,0           /* number cachelines to copy with prefetch  */
	beq     L(nocacheprefetch)


	/* We are here because we have at least one full cache line to copy,
	   and therefore some pre-touching to do. */

	cmplwi  r10,PREFETCH_AHEAD
	li      r12,64+8        /* prefetch distance  */
	ble     L(lessthanmaxprefetch)

	/* We can only do so much pre-fetching.  R11 will have the count of
	   lines left to prefetch after the initial batch of prefetches
	   are executed. */

	subi    r11,r10,PREFETCH_AHEAD
	li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
	mtctr   r10

	/* At this point r10/ctr hold the number of lines to prefetch in this
	   initial batch, and r11 holds any remainder. */

L(prefetchSRC):
	dcbt    r12,r4
	addi    r12,r12,64
	bdnz    L(prefetchSRC)


	/* Prefetching is done, or was not needed.

	   cr6 - are we on a cacheline boundary already?
	   r7  - number of quadwords to the next cacheline boundary
	*/

L(nocacheprefetch):
	mtctr   r7

	cmplwi  cr1,r5,64   /* Less than a cache line to copy? */

	/* How many bytes are left after we copy whatever full
	   cache lines we can get? */
	clrlwi  r5,r5,32-6

	beq     cr6,L(cachelinealigned)


	/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline):
	lfd     fp9,0x08(r4)
	lfdu    fp10,0x10(r4)
	stfd    fp9,0x08(r6)
	stfdu   fp10,0x10(r6)
	bdnz    L(aligntocacheline)


	.align 4
L(cachelinealigned):            /* copy while cache lines  */

	blt-    cr1,L(lessthancacheline) /* size <64  */

L(outerloop):
	cmpwi   r11,0
	mtctr   r11
	beq-    L(endloop)

	li      r11,64*ZERO_AHEAD +8    /* DCBZ dist  */

	.align  4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline.
	   The dcbz establishes the dest line ZERO_AHEAD lines ahead in
	   the cache zero-filled, so it need not be fetched from memory
	   just to be overwritten.  */
L(loop):                        /* Copy aligned body  */
	dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
	lfd     fp9,  0x08(r4)
	dcbz    r11,r6
	lfd     fp10, 0x10(r4)
	lfd     fp11, 0x18(r4)
	lfd     fp12, 0x20(r4)
	stfd    fp9,  0x08(r6)
	stfd    fp10, 0x10(r6)
	stfd    fp11, 0x18(r6)
	stfd    fp12, 0x20(r6)
	lfd     fp9,  0x28(r4)
	lfd     fp10, 0x30(r4)
	lfd     fp11, 0x38(r4)
	lfdu    fp12, 0x40(r4)
	stfd    fp9,  0x28(r6)
	stfd    fp10, 0x30(r6)
	stfd    fp11, 0x38(r6)
	stfdu   fp12, 0x40(r6)

	bdnz    L(loop)


L(endloop):
	cmpwi   r10,0
	beq-    L(endloop2)
	mtctr   r10

L(loop2):                       /* Copy aligned body; same as L(loop)
				   but without dcbt/dcbz -- these last
				   lines were already prefetched.  */
	lfd     fp9,  0x08(r4)
	lfd     fp10, 0x10(r4)
	lfd     fp11, 0x18(r4)
	lfd     fp12, 0x20(r4)
	stfd    fp9,  0x08(r6)
	stfd    fp10, 0x10(r6)
	stfd    fp11, 0x18(r6)
	stfd    fp12, 0x20(r6)
	lfd     fp9,  0x28(r4)
	lfd     fp10, 0x30(r4)
	lfd     fp11, 0x38(r4)
	lfdu    fp12, 0x40(r4)
	stfd    fp9,  0x28(r6)
	stfd    fp10, 0x30(r6)
	stfd    fp11, 0x38(r6)
	stfdu   fp12, 0x40(r6)

	bdnz    L(loop2)
L(endloop2):


	.align  4
L(lessthancacheline):           /* Was there less than cache to do ?  */
	cmplwi  cr0,r5,16
	srwi    r7,r5,4         /* divide size by 16  */
	blt-    L(do_lt16)
	mtctr   r7

L(copy_remaining):
	lfd     fp9,  0x08(r4)
	lfdu    fp10, 0x10(r4)
	stfd    fp9,  0x08(r6)
	stfdu   fp10, 0x10(r6)
	bdnz    L(copy_remaining)

L(do_lt16):                     /* less than 16 ?  */
	cmplwi  cr0,r5,0        /* copy remaining bytes (0-15)  */
	beqlr+                  /* no rest to copy  */
	addi    r4,r4,8         /* undo the ldu/stdu pre-decrement  */
	addi    r6,r6,8

L(shortcopy):                   /* SIMPLE COPY to handle size =< 15 bytes  */
	mtcrf   0x01,r5         /* low 4 bits of len select 8/4/2/1  */
	sub     r7,r4,r6        /* r7 = src - dst; loads use r7+r6  */
	bf-     cr7*4+0,8f
	lfdx    fp9,r7,r6       /* copy 8 byte  */
	stfd    fp9,0(r6)
	addi    r6,r6,8
8:
	bf      cr7*4+1,4f
	lwzx    r0,r7,r6        /* copy 4 byte  */
	stw     r0,0(r6)
	addi    r6,r6,4
4:
	bf      cr7*4+2,2f
	lhzx    r0,r7,r6        /* copy 2 byte  */
	sth     r0,0(r6)
	addi    r6,r6,2
2:
	bf      cr7*4+3,1f
	lbzx    r0,r7,r6        /* copy 1 byte  */
	stb     r0,0(r6)
1:
	blr





	/* Similar to above, but for use with 128 byte lines. */


L(big_lines):

	clrlwi  r7,r7,32-7      /* How far to next cacheline bdy? */

	cmplwi  cr6,r7,0        /* Are we on a cacheline bdy already? */

	/* Reduce total len by what it takes to get to the next cache line */
	subf    r5,r7,r5
	srwi    r7,r7,4         /* How many qw to get to the line bdy? */

	/* How many full cache lines to copy after getting to a line bdy? */
	srwi    r10,r5,7

	cmplwi  r10,0           /* If no full cache lines to copy ... */
	li      r11,0           /* number cachelines to copy with prefetch  */
	beq     L(nocacheprefetch_128)


	/* We are here because we have at least one full cache line to copy,
	   and therefore some pre-touching to do. */

	cmplwi  r10,PREFETCH_AHEAD
	li      r12,128+8       /* prefetch distance  */
	ble     L(lessthanmaxprefetch_128)

	/* We can only do so much pre-fetching.  R11 will have the count of
	   lines left to prefetch after the initial batch of prefetches
	   are executed. */

	subi    r11,r10,PREFETCH_AHEAD
	li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
	mtctr   r10

	/* At this point r10/ctr hold the number of lines to prefetch in this
	   initial batch, and r11 holds any remainder. */

L(prefetchSRC_128):
	dcbt    r12,r4
	addi    r12,r12,128
	bdnz    L(prefetchSRC_128)


	/* Prefetching is done, or was not needed.

	   cr6 - are we on a cacheline boundary already?
	   r7  - number of quadwords to the next cacheline boundary
	*/

L(nocacheprefetch_128):
	mtctr   r7

	cmplwi  cr1,r5,128  /* Less than a cache line to copy? */

	/* How many bytes are left after we copy whatever full
	   cache lines we can get? */
	clrlwi  r5,r5,32-7

	beq     cr6,L(cachelinealigned_128)


	/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline_128):
	lfd     fp9,0x08(r4)
	lfdu    fp10,0x10(r4)
	stfd    fp9,0x08(r6)
	stfdu   fp10,0x10(r6)
	bdnz    L(aligntocacheline_128)


L(cachelinealigned_128):        /* copy while cache lines  */

	blt-    cr1,L(lessthancacheline) /* size <128  */

L(outerloop_128):
	cmpwi   r11,0
	mtctr   r11
	beq-    L(endloop_128)

	li      r11,128*ZERO_AHEAD +8    /* DCBZ dist  */

	.align  4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline.
	   As in L(loop), dcbz establishes the dest line ahead of the
	   stores so it is never read from memory.  */
L(loop_128):                    /* Copy aligned body  */
	dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
	lfd     fp9,  0x08(r4)
	dcbz    r11,r6
	lfd     fp10, 0x10(r4)
	lfd     fp11, 0x18(r4)
	lfd     fp12, 0x20(r4)
	stfd    fp9,  0x08(r6)
	stfd    fp10, 0x10(r6)
	stfd    fp11, 0x18(r6)
	stfd    fp12, 0x20(r6)
	lfd     fp9,  0x28(r4)
	lfd     fp10, 0x30(r4)
	lfd     fp11, 0x38(r4)
	lfd     fp12, 0x40(r4)
	stfd    fp9,  0x28(r6)
	stfd    fp10, 0x30(r6)
	stfd    fp11, 0x38(r6)
	stfd    fp12, 0x40(r6)
	lfd     fp9,  0x48(r4)
	lfd     fp10, 0x50(r4)
	lfd     fp11, 0x58(r4)
	lfd     fp12, 0x60(r4)
	stfd    fp9,  0x48(r6)
	stfd    fp10, 0x50(r6)
	stfd    fp11, 0x58(r6)
	stfd    fp12, 0x60(r6)
	lfd     fp9,  0x68(r4)
	lfd     fp10, 0x70(r4)
	lfd     fp11, 0x78(r4)
	lfdu    fp12, 0x80(r4)
	stfd    fp9,  0x68(r6)
	stfd    fp10, 0x70(r6)
	stfd    fp11, 0x78(r6)
	stfdu   fp12, 0x80(r6)

	bdnz    L(loop_128)


L(endloop_128):
	cmpwi   r10,0
	beq-    L(endloop2_128)
	mtctr   r10

L(loop2_128):                   /* Copy aligned body; same as
				   L(loop_128) minus dcbt/dcbz.  */
	lfd     fp9,  0x08(r4)
	lfd     fp10, 0x10(r4)
	lfd     fp11, 0x18(r4)
	lfd     fp12, 0x20(r4)
	stfd    fp9,  0x08(r6)
	stfd    fp10, 0x10(r6)
	stfd    fp11, 0x18(r6)
	stfd    fp12, 0x20(r6)
	lfd     fp9,  0x28(r4)
	lfd     fp10, 0x30(r4)
	lfd     fp11, 0x38(r4)
	lfd     fp12, 0x40(r4)
	stfd    fp9,  0x28(r6)
	stfd    fp10, 0x30(r6)
	stfd    fp11, 0x38(r6)
	stfd    fp12, 0x40(r6)
	lfd     fp9,  0x48(r4)
	lfd     fp10, 0x50(r4)
	lfd     fp11, 0x58(r4)
	lfd     fp12, 0x60(r4)
	stfd    fp9,  0x48(r6)
	stfd    fp10, 0x50(r6)
	stfd    fp11, 0x58(r6)
	stfd    fp12, 0x60(r6)
	lfd     fp9,  0x68(r4)
	lfd     fp10, 0x70(r4)
	lfd     fp11, 0x78(r4)
	lfdu    fp12, 0x80(r4)
	stfd    fp9,  0x68(r6)
	stfd    fp10, 0x70(r6)
	stfd    fp11, 0x78(r6)
	stfdu   fp12, 0x80(r6)
	bdnz    L(loop2_128)
L(endloop2_128):

	b       L(lessthancacheline)


END (memcpy)
libc_hidden_builtin_def (memcpy)