/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
    Returns 'dst' + 'len'.  */

#ifndef MEMPCPY
# define MEMPCPY __mempcpy
#endif
	.machine  power7
ENTRY_TOCLESS (MEMPCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
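	/* DST is stashed at -16(SP) so the return value (DST + LEN) can be
	   reloaded at exit; r31 is callee-saved and holds the remaining
	   length on the large-copy paths.  */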
	ble	cr1,L(copy_LT_32)   /* If move < 32 bytes use short move
				       code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */

	clrldi	10,4,61	      /* Check alignment of SRC.  */
	cmpld	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)
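	/* Same low-order alignment: align SRC/DST and use the doubleword
	   copy loop below.  Mismatched alignment is handled in
	   L(copy_GE_32_unaligned) using aligned vector loads plus vperm
	   to realign the data.  */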

	srdi	9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrldi	0,0,61
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */

1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61      /* Check alignment of SRC again.  */
	srdi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrldi	11,31,61
	mtcrf	0x01,9

	srdi	8,31,5
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12
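	/* r8 = number of 32-byte blocks for the main loop; cr7 holds the
	   low bits of the doubleword count (r9) so that 1~3 leading
	   doublewords can be peeled off below.  */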

	/* Copy 1~3 doublewords so the main loop starts
	at a multiple of 32 bytes.  */

	bf	30,1f
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f
	ld	0,16(12)
	std	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8
	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32

	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */
	rldicr	0,31,0,60
	mtcrf	0x01,31
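	/* r0 = remaining length with its low 3 bits cleared, i.e. the
	   number of bytes copied by the doubleword code above; cr7 gets
	   the low bits of r31 so the 0~7 byte tail can be handled.  */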
	beq	cr6,0f

.L9:
	add	3,3,0
	add	12,12,0

	/*  At this point we have a tail of 0-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmpldi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3
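	/* r0 = (-SRC) & 3: the number of leading bytes to copy before SRC
	   becomes word-aligned (zero if it already is).  */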
	cmpldi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf  0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f
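	/* cr6 still holds the comparison of LEN with 8 from L(copy_LT_32);
	   equal means exactly 8 bytes, copied below as two word
	   load/store pairs.  */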

	/* Though we could've used ld/std here, they are still
	slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
	the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st
				 quadword.  */
	andi.	11,3,15	      /* Check alignment of DST (against
				 quadwords).  */
	srdi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	ld	6,0(12)
	addi	12,12,8
	std	6,0(3)
	addi	3,3,8
0:
	clrldi	10,12,60      /* Check alignment of SRC.  */
	srdi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indices to speed up the indexed vector operations.  */
	clrldi	11,31,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,11,0
	srdi	8,31,5	      /* Set up the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr    5,0,12
#else
	lvsl    5,0,12
#endif
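	/* vr5 is the permute control that realigns two consecutive aligned
	   quadword loads from SRC: lvsr on little-endian, lvsl on
	   big-endian, matching the operand order of the vperm below.  */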
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes so the main loop below operates on
	   32-byte blocks.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4
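	/* Keep the quadword just loaded in vr3 so the next vperm can
	   combine it with the following aligned load.  */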

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again. This is faster than having unaligned
	vector instructions though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm   10,3,4,5
#else
	vperm   10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	rldicr	0,31,0,59
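	/* r0 = remaining length with its low 4 bits cleared: the number of
	   bytes already copied by the quadword code above.  cr1 (set
	   earlier from r31 & 15) tells whether any tail bytes remain.  */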
	mtcrf	0x01,31
	beq	cr1,0f

	add	3,3,0
	add	12,12,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

END_GEN_TB (MEMPCPY,TB_TOCLESS)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)