1/* Optimized memset implementation for PowerPC64/POWER7.
2   Copyright (C) 2010-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]));
22   Returns 's'.  */
23
24#ifndef MEMSET
25# define MEMSET memset
26#endif
27	.machine power7
28ENTRY_TOCLESS (MEMSET, 5)
29	CALL_MCOUNT 3
30
31L(_memset):
32	cmpldi	cr7,5,31
33	cmpldi	cr6,5,8
34	mr	10,3
35
36	/* Replicate byte to word.  */
37	insrdi	4,4,8,48
38	insrdi	4,4,16,32
39	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
40
41	neg	0,3
42	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */
43
44	andi.	11,10,7		/* Check alignment of SRC.  */
45	insrdi	4,4,32,0	/* Replicate word to double word.  */
46
47	mr	12,5
48	beq	L(big_aligned)
49
50	clrldi	0,0,61
51	mtocrf	0x01,0
52	subf	5,0,5
53
54	/* Get DST aligned to 8 bytes.  */
551:	bf	31,2f
56
57	stb	4,0(10)
58	addi	10,10,1
592:	bf	30,4f
60
61	sth	4,0(10)
62	addi	10,10,2
634:	bf	29,L(big_aligned)
64
65	stw	4,0(10)
66	addi	10,10,4
67
68	.align	4
69L(big_aligned):
70
71	cmpldi	cr5,5,255
72	li	0,32
73	dcbtst	0,10
74	cmpldi	cr6,4,0
75	srdi	9,5,3	/* Number of full doublewords remaining.  */
76	crand	27,26,21
77	mtocrf	0x01,9
78	bt	27,L(huge)
79
80	/* From this point on, we'll copy 32+ bytes and the value
81	   isn't 0 (so we can't use dcbz).  */
82
83	srdi	8,5,5
84	clrldi	11,5,61
85	cmpldi	cr6,11,0
86	cmpldi	cr1,9,4
87	mtctr	8
88
89	/* Copy 1~3 doublewords so the main loop starts
90	at a multiple of 32 bytes.  */
91
92	bf	30,1f
93
94	std	4,0(10)
95	std	4,8(10)
96	addi	10,10,16
97	bf	31,L(big_loop)
98
99	std	4,0(10)
100	addi	10,10,8
101	mr	12,10
102	blt	cr1,L(tail_bytes)
103	b	L(big_loop)
104
105	.align	4
1061:	/* Copy 1 doubleword.  */
107	bf	31,L(big_loop)
108
109	std	4,0(10)
110	addi	10,10,8
111
112	/* Main aligned copy loop.  Copies 32-bytes at a time and
113	   ping-pong through r10 and r12 to avoid AGEN delays.  */
114	.align	4
115L(big_loop):
116	addi	12,10,32
117	std	4,0(10)
118	std	4,8(10)
119	std	4,16(10)
120	std	4,24(10)
121	bdz	L(tail_bytes)
122
123	addi	10,10,64
124	std	4,0(12)
125	std	4,8(12)
126	std	4,16(12)
127	std	4,24(12)
128	bdnz	L(big_loop)
129
130	mr	12,10
131	b	L(tail_bytes)
132
133	.align	4
134L(tail_bytes):
135
136	/* Check for tail bytes.  */
137	beqlr	cr6
138
139	clrldi	0,5,61
140	mtocrf	0x01,0
141
142	/*  At this point we have a tail of 0-7 bytes and we know that the
143	destination is doubleword-aligned.  */
1444:	/* Copy 4 bytes.  */
145	bf	29,2f
146
147	stw	4,0(12)
148	addi	12,12,4
1492:	/* Copy 2 bytes.  */
150	bf	30,1f
151
152	sth	4,0(12)
153	addi	12,12,2
1541:	/* Copy 1 byte.  */
155	bflr	31
156
157	stb	4,0(12)
158	blr
159
160	/* Special case when value is 0 and we have a long length to deal
161	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
162	   dcbz though, we need to get the destination 128-bytes aligned.  */
163	.align	4
164L(huge):
165	andi.	11,10,127
166	neg	0,10
167	beq	L(huge_aligned)
168
169	clrldi	0,0,57
170	subf	5,0,5
171	srdi	0,0,3
172	mtocrf	0x01,0
173
174	/* Get DST aligned to 128 bytes.  */
1758:	bf	28,4f
176
177	std	4,0(10)
178	std	4,8(10)
179	std	4,16(10)
180	std	4,24(10)
181	std	4,32(10)
182	std	4,40(10)
183	std	4,48(10)
184	std	4,56(10)
185	addi	10,10,64
186	.align	4
1874:	bf	29,2f
188
189	std	4,0(10)
190	std	4,8(10)
191	std	4,16(10)
192	std	4,24(10)
193	addi	10,10,32
194	.align	4
1952:	bf	30,1f
196
197	std	4,0(10)
198	std	4,8(10)
199	addi	10,10,16
200	.align	4
2011:	bf	31,L(huge_aligned)
202
203	std	4,0(10)
204	addi	10,10,8
205
206
207L(huge_aligned):
208	srdi	8,5,7
209	clrldi	11,5,57
210	cmpldi	cr6,11,0
211	mtctr	8
212
213	.align	4
214L(huge_loop):
215	dcbz	0,10
216	addi	10,10,128
217	bdnz	L(huge_loop)
218
219	/* Check how many bytes are still left.  */
220	beqlr	cr6
221
222	subf	9,3,10
223	subf	5,9,12
224	srdi	8,5,3
225	cmpldi	cr6,8,0
226	mtocrf	0x01,8
227
228	/* We have a tail o 1~127 bytes.  Copy up to 15 doublewords for
229	speed.  We'll handle the resulting tail bytes later.  */
230	beq	cr6,L(tail)
231
2328:	bf	28,4f
233
234	std	4,0(10)
235	std	4,8(10)
236	std	4,16(10)
237	std	4,24(10)
238	std	4,32(10)
239	std	4,40(10)
240	std	4,48(10)
241	std	4,56(10)
242	addi	10,10,64
243	.align	4
2444:	bf	29,2f
245
246	std	4,0(10)
247	std	4,8(10)
248	std	4,16(10)
249	std	4,24(10)
250	addi	10,10,32
251	.align	4
2522:	bf	30,1f
253
254	std	4,0(10)
255	std	4,8(10)
256	addi	10,10,16
257	.align	4
2581:	bf	31,L(tail)
259
260	std	4,0(10)
261	addi	10,10,8
262
263	/* Handle the rest of the tail bytes here.  */
264L(tail):
265	mtocrf	0x01,5
266
267	.align	4
2684:	bf	29,2f
269
270	stw	4,0(10)
271	addi	10,10,4
272	.align	4
2732:	bf	30,1f
274
275	sth	4,0(10)
276	addi	10,10,2
277	.align	4
2781:	bflr	31
279
280	stb	4,0(10)
281	blr
282
283	/* Expanded tree to copy tail bytes without increments.  */
284	.align	4
285L(copy_tail):
286	bf	29,L(FXX)
287
288	stw	4,0(10)
289	bf	30,L(TFX)
290
291	sth	4,4(10)
292	bflr	31
293
294	stb	4,6(10)
295	blr
296
297	.align	4
298L(FXX):	bf	30,L(FFX)
299
300	sth	4,0(10)
301	bflr	31
302
303	stb	4,2(10)
304	blr
305
306	.align	4
307L(TFX):	bflr	31
308
309	stb	4,4(10)
310	blr
311
312	.align	4
313L(FFX):	bflr	31
314
315	stb	4,0(10)
316	blr
317
318	/* Handle copies of 9~31 bytes.  */
319	.align	4
320L(medium):
321	/* At least 9 bytes to go.  */
322	andi.	11,10,3
323	clrldi	0,0,62
324	beq	L(medium_aligned)
325
326	/* Force 4-bytes alignment for DST.  */
327	mtocrf	0x01,0
328	subf	5,0,5
3291:	/* Copy 1 byte.  */
330	bf	31,2f
331
332	stb	4,0(10)
333	addi	10,10,1
3342:	/* Copy 2 bytes.  */
335	bf	30,L(medium_aligned)
336
337	sth	4,0(10)
338	addi	10,10,2
339
340	.align	4
341L(medium_aligned):
342	/* At least 6 bytes to go, and DST is word-aligned.  */
343	cmpldi	cr1,5,16
344	mtocrf	0x01,5
345	blt	cr1,8f
346
347	/* Copy 16 bytes.  */
348	stw	4,0(10)
349	stw	4,4(10)
350	stw	4,8(10)
351	stw	4,12(10)
352	addi	10,10,16
3538:	/* Copy 8 bytes.  */
354	bf	28,4f
355
356	stw	4,0(10)
357	stw	4,4(10)
358	addi	10,10,8
3594:	/* Copy 4 bytes.  */
360	bf	29,2f
361
362	stw	4,0(10)
363	addi	10,10,4
3642:	/* Copy 2-3 bytes.  */
365	bf	30,1f
366
367	sth	4,0(10)
368	addi	10,10,2
3691:	/* Copy 1 byte.  */
370	bflr	31
371
372	stb	4,0(10)
373	blr
374
375	/* Handles copies of 0~8 bytes.  */
376	.align	4
377L(small):
378	mtocrf	0x01,5
379	bne	cr6,L(copy_tail)
380
381	stw	4,0(10)
382	stw	4,4(10)
383	blr
384
385END_GEN_TB (MEMSET,TB_TOCLESS)
386libc_hidden_builtin_def (memset)
387