/* Optimized memset implementation for PowerPC64/POWER8.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */
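
/* Illustrative overview: a rough C sketch of the strategy implemented
   below (not part of the build; the helper names are hypothetical and
   only label the code paths that follow):

     void *memset (void *s, int c, size_t n)
     {
       uint64_t word = replicate_byte (c);          // c in every byte lane
       if (n <= 31)
         return small_unrolled_stores (s, word, n);
       align_dst_to_16_bytes (&s, word, &n);
       if (c == 0 && n > 255)
         return zero_cache_lines_with_dcbz (s, n);  // L(huge_dcbz)
       if (n >= 255)
         return vector_stvx_stores (s, n);          // L(huge_vector)
       return doubleword_store_loop (s, word, n);   // L(big_loop)
     }  */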

#ifndef MEMSET
# define MEMSET memset
#endif
	.machine  power8
ENTRY_TOCLESS (MEMSET, 5)
	CALL_MCOUNT 3

L(_memset):
	cmpldi	cr7,r5,31
	neg	r0,r3
	mr	r10,r3

	insrdi	r4,r4,8,48
	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
	ble	cr7,L(write_LT_32)

	andi.	r11,r10,15	/* Check alignment of DST.  */
	insrdi	r4,r4,32,0	/* Replicate word to double word.  */
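	/* After the three insrdi above, r4 holds c in every byte lane of the
	   doubleword, e.g. for c = 0xAB the pattern grows
	   0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB.  */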

	beq	L(big_aligned)

	mtocrf	0x01,r0
	clrldi	r0,r0,60
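	/* mtocrf 0x01 copies the low four bits of r0 = -DST into CR7, so
	   each CR bit says how far DST is from 16-byte alignment: bit 31
	   means 1 byte, bit 30 means 2, bit 29 means 4, bit 28 means 8.
	   clrldi keeps the same four bits in r0 so they can be subtracted
	   from the length at label 16 below.  */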

	/* Get DST aligned to 16 bytes.  */
1:	bf	31,2f
	stb	r4,0(r10)
	addi	r10,r10,1

2:	bf	30,4f
	sth	r4,0(r10)
	addi	r10,r10,2

4:	bf	29,8f
	stw	r4,0(r10)
	addi	r10,r10,4

8:	bf      28,16f
	std     r4,0(r10)
	addi    r10,r10,8

16:	subf	r5,r0,r5

	.align	4
L(big_aligned):
	/* For sizes larger than 255 bytes there are two possible paths:
	   - if the constant is '0', zero full cache lines with dcbz
	   - otherwise use vector instructions.  */
	cmpldi	cr5,r5,255
	dcbtst	0,r10
	cmpldi	cr6,r4,0
	crand	27,26,21
	bt	27,L(huge_dcbz)
	bge	cr5,L(huge_vector)
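	/* Path selection: CR bit 21 is cr5.gt (n > 255) and bit 26 is
	   cr6.eq (c == 0); crand ANDs them into bit 27, so the two branches
	   above behave roughly like this illustrative C (not part of the
	   build):

	     if (c == 0 && n > 255)
	       goto huge_dcbz;          // zero whole cache lines
	     else if (n >= 255)
	       goto huge_vector;        // 16-byte stvx stores
	     // else: fall through to the 32-byte std loop

	   Note that at this point n excludes the alignment bytes already
	   written above.  */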


	/* Size between 32 and 255 bytes with a constant other than 0: use
	   doubleword stores to achieve the best throughput.  */
	srdi    r8,r5,5
	clrldi  r11,r5,59
	cmpldi  cr6,r11,0
	cmpdi	r8,0
	beq     L(tail_bytes)
	mtctr   r8
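	/* r8 = n / 32 is the CTR count for the 32-byte store loop below,
	   r11 = n % 32 is left for L(tail_bytes), and cr6 records whether
	   that remainder is zero.  */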

	/* Main aligned write loop, writes 32 bytes at a time.  */
	.align  4
L(big_loop):
	std     r4,0(r10)
	std     r4,8(r10)
	std     r4,16(r10)
	std     r4,24(r10)
	addi    r10,r10,32
	bdz     L(tail_bytes)

	std     r4,0(r10)
	std     r4,8(r10)
	std     r4,16(r10)
	std     r4,24(r10)
	addi    r10,r10,32
	bdnz    L(big_loop)

	b       L(tail_bytes)

	/* Write remaining 1~31 bytes.  */
	.align  4
L(tail_bytes):
	beqlr   cr6

	srdi    r7,r11,4
	clrldi  r8,r11,60
	mtocrf  0x01,r7
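	/* Tail dispatch: r7 = remainder / 16 and r8 = remainder % 16 are
	   moved into CR7 in turn, so the bit from r7 selects one extra
	   16-byte block and the low nibble of r8 (CR bits 28..31) selects
	   the final 8/4/2/1-byte stores.  */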

	.align	4
	bf	31,8f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
8:	mtocrf	0x1,r8
	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf      29,2f
	stw     r4,0(r10)
	addi    r10,r10,4

	.align	4
2:	bf      30,1f
	sth     r4,0(r10)
	addi    r10,r10,2

	.align  4
1:      bflr    31
	stb     r4,0(r10)
	blr

	/* Size larger than 255 bytes with a constant other than 0: use
	   vector instructions to achieve the best throughput.  */
L(huge_vector):
	/* Replicate set byte to quadword in VMX register.  */
	mtvsrd	v1,r4
	xxpermdi 32,v0,v1,0
	vspltb	 v2,v0,15
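	/* mtvsrd moves the already-replicated doubleword from r4 into v1,
	   xxpermdi copies it into the other doubleword of v0, and vspltb
	   broadcasts byte 15 of v0 (in ISA element numbering, a single copy
	   of c) into every byte of v2, the register used by the stvx stores
	   below.  */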

	/* Main aligned write loop: 128 bytes at a time.  */
	li	r6,16
	li	r7,32
	li	r8,48
	mtocrf	0x02,r5
	srdi	r12,r5,7
	cmpdi	r12,0
	beq	L(aligned_tail)
	mtctr	r12
	b	L(aligned_128loop)
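	/* CTR = n / 128 and r6/r7/r8 = 16/32/48 are the stvx offsets, so
	   each iteration of the loop below stores 128 bytes; mtocrf 0x02,r5
	   saved the 64/32/16-byte bits of n in CR6 for the bf 25/26/27
	   tests in L(aligned_tail).  */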

	.align  4
L(aligned_128loop):
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	bdnz	L(aligned_128loop)

	/* Write remaining 1~127 bytes.  */
L(aligned_tail):
	mtocrf	0x01,r5
	bf	25,32f
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64

32:	bf	26,16f
	stvx	v2,0,r10
	stvx	v2,r10,r6
	addi	r10,r10,32

16:	bf	27,8f
	stvx	v2,0,r10
	addi	r10,r10,16

8:	bf	28,4f
	std     r4,0(r10)
	addi	r10,r10,8

	/* Copies 4~7 bytes.  */
4:	bf	29,L(tail2)
	stw     r4,0(r10)
	bf      30,L(tail5)
	sth     r4,4(r10)
	bflr	31
	stb     r4,6(r10)
	/* Return original DST pointer.  */
	blr

	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
	   Before using dcbz though, we need to get the destination 128-byte
	   aligned.  */
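	/* Roughly, as an illustrative sketch (not part of the build):

	     write doublewords until DST is 128-byte aligned;
	     while (n >= 512)
	       { dcbz DST+0, DST+128, DST+256, DST+384; DST += 512; n -= 512; }
	     finish the 1~511-byte remainder with dcbz/std/stw/sth/stb;

	   dcbz zeroes a full 128-byte cache line without fetching it first,
	   which is why this path is limited to c == 0.  */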
	.align	4
L(huge_dcbz):
	andi.	r11,r10,127
	neg	r0,r10
	beq	L(huge_dcbz_aligned)

	clrldi	r0,r0,57
	subf	r5,r0,r5
	srdi	r0,r0,3
	mtocrf	0x01,r0
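	/* r0 = -DST & 127 is the distance to the next 128-byte boundary;
	   r0 >> 3 (a doubleword count) goes into CR7, so bits 28..31 select
	   the 64/32/16/8-byte std groups below, and r5 has already been
	   reduced by those alignment bytes.  */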

	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
8:	bf	28,4f

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	29,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	30,1f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
1:	bf	31,L(huge_dcbz_aligned)
	std	r4,0(r10)
	addi	r10,r10,8

L(huge_dcbz_aligned):
	/* Setup dcbz unroll offsets and count numbers.  */
	srdi	r8,r5,9
	clrldi	r11,r5,55
	cmpldi	cr6,r11,0
	li	r9,128
	cmpdi	r8,0
	beq     L(huge_tail)
	li	r7,256
	li	r6,384
	mtctr	r8
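	/* CTR = n / 512 iterations of four dcbz (one per 128-byte cache
	   line), with r9/r7/r6 = 128/256/384 as the line offsets; r11 =
	   n % 512 is the remainder handled in L(huge_tail).  */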

	.align	4
L(huge_loop):
	/* Sets 512 bytes to zero in each iteration; the loop unrolling shows
	   a throughput boost for large sizes (2048 bytes or higher).  */
	dcbz	0,r10
	dcbz	r9,r10
	dcbz	r7,r10
	dcbz	r6,r10
	addi	r10,r10,512
	bdnz	L(huge_loop)

	beqlr	cr6

L(huge_tail):
	srdi    r6,r11,8
	srdi    r7,r11,4
	clrldi  r8,r11,4
	cmpldi  cr6,r8,0
	mtocrf  0x01,r6
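	/* Remainder breakdown: r6 = remainder / 256 drives the two-dcbz
	   block, r7 = remainder / 16 drives the 128-byte dcbz and the
	   64/32/16-byte std groups, and r8 keeps the remainder's low bits
	   for the final 1~15-byte stores in L(tail).  */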

	beq	cr6,L(tail)

	/* We have 1~511 bytes remaining.  */
	.align	4
32:	bf	31,16f
	dcbz	0,r10
	dcbz	r9,r10
	addi	r10,r10,256

	.align	4
16:	mtocrf  0x01,r7
	bf	28,8f
	dcbz	0,r10
	addi	r10,r10,128

	.align	4
8:	bf	29,4f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	30,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	31,L(tail)
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16
	.align	4

	/* Remaining 1~15 bytes.  */
L(tail):
	mtocrf  0x01,r8

	.align	4
8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
	   by just unrolling all operations.  */
	.align	4
L(write_LT_32):
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(write_LE_8)
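	/* For 0~31 bytes everything is driven by CR bits of n itself:
	   sizes of at most 8 go to L(write_LE_8); otherwise the code nudges
	   DST to 4-byte alignment and then uses unrolled stores, where cr1
	   (n >= 16) selects one 16-byte block and CR bits 28..31 (from
	   mtocrf 0x01,r5) select the final 8/4/2/1-byte stores.  */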

	/* At least 9 bytes to go.  */
	neg	r8,r4
	andi.	r0,r8,3
	cmpldi	cr1,r5,16
	beq	L(write_LT_32_aligned)

	/* Force 4-byte alignment for DST.  */
	mtocrf	0x01,r0
	subf	r5,r0,r5

2:	bf	30,1f
	/* Use stb instead of sth because it doesn't generate
	   alignment interrupts on cache-inhibited storage.  */
	stb	r4,0(r10)
	stb	r4,1(r10)
	addi	r10,r10,2

1:	bf	31,L(end_4bytes_alignment)
	stb	r4,0(r10)
	addi	r10,r10,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(write_LT_32_aligned):
	blt	cr1,8f

	stw	r4,0(r10)
	stw	r4,4(r10)
	stw	r4,8(r10)
	stw	r4,12(r10)
	addi	r10,r10,16

8:	bf	28,L(tail4)
	stw	r4,0(r10)
	stw	r4,4(r10)
	addi	r10,r10,8

	.align	4
	/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	sth	r4,0(r10)
	bflr	31
	stb	r4,2(r10)
	blr

	.align	4
L(tail5):
	bflr	31
	stb	r4,4(r10)
	blr

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(write_LE_8):
	bne	cr6,L(LE7_tail4)
	/* If input is word aligned, use stw, else use stb.  */
	andi.	r0,r10,3
	bne	L(8_unalign)

	stw	r4,0(r10)
	stw	r4,4(r10)
	blr
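	/* The two paths below handle a size of exactly 8 with an unaligned
	   destination: the store widths are chosen from the destination's
	   alignment so that no store is misaligned, since misaligned stores
	   can cause alignment interrupts on cache-inhibited storage (see
	   the stb comments elsewhere in this file).  */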

	/* Unaligned input and size is 8.  */
	.align	4
L(8_unalign):
	andi.	r0,r10,1
	beq	L(8_hwalign)
	stb	r4,0(r10)
	sth	r4,1(r10)
	sth	r4,3(r10)
	sth	r4,5(r10)
	stb	r4,7(r10)
	blr

	/* Halfword aligned input and size is 8.  */
	.align	4
L(8_hwalign):
	sth	r4,0(r10)
	sth	r4,2(r10)
	sth	r4,4(r10)
	sth	r4,6(r10)
	blr

	.align	4
	/* Copies 4~7 bytes.  */
L(LE7_tail4):
	/* Use stb instead of sth because it doesn't generate
	   alignment interrupts on cache-inhibited storage.  */
	bf	29,L(LE7_tail2)
	stb	r4,0(r10)
	stb	r4,1(r10)
	stb	r4,2(r10)
	stb	r4,3(r10)
	bf	30,L(LE7_tail5)
	stb	r4,4(r10)
	stb	r4,5(r10)
	bflr	31
	stb	r4,6(r10)
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(LE7_tail2):
	bf	30,1f
	stb	r4,0(r10)
	stb	r4,1(r10)
	bflr	31
	stb	r4,2(r10)
	blr

	.align	4
L(LE7_tail5):
	bflr	31
	stb	r4,4(r10)
	blr

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)