/* Optimized memcpy implementation for cached memory on PowerPC64/POWER8.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#include <sysdep.h>


/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.

   r3 is never written, so it still holds 'dst' at every return.  The
   copy strategy is selected by len:

     len <= 15        scalar copies of 1, 2, 4 and 8 bytes, one for each
		      set bit in the low four bits of len;
     16 <= len <= 32  two 16-byte vector copies: the first 16 and the
		      last 16 bytes (the two windows may overlap);
     32 < len <= 64   four 16-byte vector copies: the first 32 and the
		      last 32 bytes (the two windows may overlap);
     len > 64         advance to a 16-byte aligned dst, run the 128-byte
		      loop, then an optional 64-byte block, then a tail
		      of at most 63 bytes.

   In the len > 64 path r4 is the src cursor, r12 the dst cursor and r5
   the number of bytes still to copy.

   NOTE(review): the vN operands of lxvd2x/stxvd2x are names supplied by
   sysdep.h; presumably they are plain register numbers -- confirm.  */

	.machine power8
ENTRY_TOCLESS (__memcpy_power8_cached, 5)
	CALL_MCOUNT 3

	cmpldi	cr7,r5,15
	bgt	cr7,L(ge_16)

	/* len <= 15.  Test each of the low four bits of len in turn and
	   copy 1, 2, 4 and 8 bytes accordingly.  r9 is the dst cursor and
	   r4 the src cursor.  */
	andi.	r9,r5,0x1	/* Only the cr0 result is used.  */
	mr	r9,r3		/* r9 = dst; cr0 is left untouched.  */
	beq	cr0,1f
	lbz	r10,0(r4)
	addi	r9,r3,1
	addi	r4,r4,1
	stb	r10,0(r3)
1:
	andi.	r10,r5,0x2
	beq	cr0,2f
	lhz	r10,0(r4)
	addi	r9,r9,2
	addi	r4,r4,2
	sth	r10,-2(r9)	/* r9 was already advanced past this slot.  */
2:
	andi.	r10,r5,0x4
	beq	cr0,3f
	lwz	r10,0(r4)
	addi	r9,r9,4
	addi	r4,r4,4
	stw	r10,-4(r9)	/* r9 was already advanced past this slot.  */
3:
	andi.	r10,r5,0x8
	beqlr	cr0		/* No 8-byte piece: done.  */
	ld	r10,0(r4)
	std	r10,0(r9)
	blr

	.align 4
L(ge_16):
	cmpldi	cr7,r5,32
	ble	cr7,L(ge_16_le_32)
	cmpldi	cr7,r5,64
	ble	cr7,L(gt_32_le_64)

	/* len > 64.  Align dst to 16 bytes: copy one unaligned 16-byte
	   vector to dst, then move both cursors forward by
	   16 - (dst & 15) so that r12 is 16-byte aligned.  The bytes of
	   that vector lying past the boundary are stored again by the
	   code that follows.  */
	andi.	r9,r3,0xf
	mr	r12,r3
	beq	cr0,L(dst_is_align_16)
	lxvd2x	v0,0,r4
	subfic	r12,r9,16	/* r12 = 16 - (dst & 15).  */
	subf	r5,r12,r5	/* len -= r12.  */
	add	r4,r4,r12	/* src += r12.  */
	add	r12,r3,r12	/* r12 = aligned dst cursor.  */
	stxvd2x	v0,0,r3
L(dst_is_align_16):
	cmpldi	cr7,r5,127
	ble	cr7,L(tail_copy)
	mr	r9,r12		/* r9 = dst cursor used inside the loop.  */
	srdi	r10,r5,7	/* Number of 128-byte iterations.  */
	li	r11,16
	li	r6,32
	li	r7,48
	mtctr	r10
	clrrdi	r0,r5,7		/* r0 = bytes the loop will copy.  */

	/* Main loop, copy 128 bytes each time.  Each iteration handles two
	   64-byte halves: r4/r9 address the first half of src/dst and
	   r8/r10 the second half.  r11, r6 and r7 hold the constant
	   offsets 16, 32 and 48.  */
	.align 4
L(copy_128):
	lxvd2x	v10,0,r4
	lxvd2x	v11,r4,r11
	addi	r8,r4,64
	addi	r10,r9,64
	lxvd2x	v12,r4,r6
	lxvd2x	v0,r4,r7
	addi	r4,r4,128
	stxvd2x	v10,0,r9
	stxvd2x	v11,r9,r11
	stxvd2x	v12,r9,r6
	stxvd2x	v0,r9,r7
	addi	r9,r9,128
	lxvd2x	v10,0,r8
	lxvd2x	v11,r8,r11
	lxvd2x	v12,r8,r6
	lxvd2x	v0,r8,r7
	stxvd2x	v10,0,r10
	stxvd2x	v11,r10,r11
	stxvd2x	v12,r10,r6
	stxvd2x	v0,r10,r7
	bdnz	L(copy_128)

	add	r12,r12,r0	/* Advance the dst cursor past the loop.  */
	rldicl	r5,r5,0,57	/* Keep the low 7 bits: r5 = len % 128.  */
L(tail_copy):
	/* At most 127 bytes remain.  Copy one 64-byte block if there are
	   at least 64 of them.  */
	cmpldi	cr7,r5,63
	ble	cr7,L(tail_le_64)
	li	r8,16
	li	r10,32
	lxvd2x	v10,0,r4
	li	r9,48
	addi	r5,r5,-64
	lxvd2x	v11,r4,r8
	lxvd2x	v12,r4,r10
	lxvd2x	v0,r4,r9
	addi	r4,r4,64
	stxvd2x	v10,0,r12
	stxvd2x	v11,r12,r8
	stxvd2x	v12,r12,r10
	stxvd2x	v0,r12,r9
	addi	r12,r12,64

L(tail_le_64):
	/* At most 63 bytes remain.  */
	cmpldi	cr7,r5,32
	bgt	cr7,L(tail_gt_32_le_64)
	cmpdi	cr7,r5,0
	beqlr	cr7		/* Nothing left: done.  */

	/* 1..32 bytes remain.  Copy the 32 bytes that end at the end of the
	   buffers.  r5 becomes the offset (-31..0) of that window relative
	   to the cursors, so it may start before them; this point is only
	   reached in the len > 64 path, after earlier bytes were copied.  */
	addi	r5,r5,-32
	li	r9,16
	add	r8,r4,r5
	add	r10,r12,r5
	lxvd2x	v12,r4,r5
	lxvd2x	v0,r8,r9
	stxvd2x	v12,r12,r5
	stxvd2x	v0,r10,r9
	blr

	.align 4
L(ge_16_le_32):
	/* 16 <= len <= 32: copy the first 16 bytes and the last 16 bytes.
	   r5 becomes the offset of the last 16-byte window.  */
	addi	r5,r5,-16
	lxvd2x	v0,0,r4
	lxvd2x	v1,r4,r5
	stxvd2x	v0,0,r3
	stxvd2x	v1,r3,r5
	blr

	.align 4
L(gt_32_le_64):
	/* 32 < len <= 64: set up the dst cursor expected by the shared tail
	   code and fall through into it.  */
	mr	r12,r3

	.align 4
L(tail_gt_32_le_64):
	/* 32 < r5 <= 64 bytes at r4/r12: copy the first 32 bytes and the
	   last 32 bytes.  All four loads are issued before the first
	   store.  r5 becomes the offset of the last 32-byte window.  */
	li	r9,16
	lxvd2x	v0,0,r4
	addi	r5,r5,-32
	lxvd2x	v1,r4,r9
	add	r8,r4,r5
	lxvd2x	v2,r4,r5
	add	r10,r12,r5
	lxvd2x	v3,r8,r9
	stxvd2x	v0,0,r12
	stxvd2x	v1,r12,r9
	stxvd2x	v2,r12,r5
	stxvd2x	v3,r10,r9
	blr

END_GEN_TB (__memcpy_power8_cached,TB_TOCLESS)