1/* Optimized memset implementation for PowerPC32/POWER7.
2   Copyright (C) 2010-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */
23
24	.machine  power7
25EALIGN (memset, 5, 0)
26	CALL_MCOUNT
27
28	.align	4
29L(_memset):
30	cmplwi	cr7,5,31
31	cmplwi	cr6,5,8
32	mr	10,3		/* Save original argument for later.  */
33	mr	7,1		/* Save original r1 for later.  */
34	cfi_offset(31,-8)
35
36	/* Replicate byte to word.  */
37	insrwi	4,4,8,16
38	insrwi	4,4,16,0
39
40	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
41
42	neg	0,3
43	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */
44
45	/* Save our word twice to create a doubleword that we will later
46	   copy to a FPR.  */
47	stwu	1,-32(1)
48	andi.	11,10,7		/* Check alignment of DST.  */
49	mr	12,5
50	stw	4,24(1)
51	stw	4,28(1)
52	beq	L(big_aligned)
53
54	clrlwi	0,0,29
55	mtocrf	0x01,0
56	subf	5,0,5
57
58	/* Get DST aligned to 8 bytes.  */
591:	bf	31,2f
60
61	stb	4,0(10)
62	addi	10,10,1
632:	bf	30,4f
64
65	sth	4,0(10)
66	addi	10,10,2
674:	bf	29,L(big_aligned)
68
69	stw	4,0(10)
70	addi	10,10,4
71
72	.align	4
73L(big_aligned):
74	cmplwi	cr5,5,255
75	li	0,32
76	cmplwi	cr1,5,160
77	dcbtst	0,10
78	cmplwi	cr6,4,0
79	srwi	9,5,3		/* Number of full doublewords remaining.  */
80	crand	27,26,21
81	mtocrf	0x01,9
82	bt	27,L(huge)
83
84	/* From this point on, we'll copy 32+ bytes and the value
85	   isn't 0 (so we can't use dcbz).  */
86
87	srwi	8,5,5
88	clrlwi	11,5,29
89	cmplwi	cr6,11,0
90	cmplwi	cr1,9,4
91	mtctr	8
92
93	/* Copy 1~3 doublewords so the main loop starts
94	at a multiple of 32 bytes.  */
95
96	bf	30,1f
97
98	stw	4,0(10)
99	stw	4,4(10)
100	stw	4,8(10)
101	stw	4,12(10)
102	addi	10,10,16
103	bf	31,L(big_loop)
104
105	stw	4,0(10)
106	stw	4,4(10)
107	addi	10,10,8
108	mr	12,10
109	blt	cr1,L(tail_bytes)
110
111	b	L(big_loop)
112
113	.align	4
1141:	/* Copy 1 doubleword.  */
115	bf	31,L(big_loop)
116
117	stw	4,0(10)
118	stw	4,4(10)
119	addi	10,10,8
120
121	/* First use a 32-bytes loop with stw's to try and avoid the LHS due
122	   to the lfd we will do next.  Also, ping-pong through r10 and r12
123	   to avoid AGEN delays.  */
124	.align	4
125L(big_loop):
126	addi	12,10,32
127	stw	4,0(10)
128	stw	4,4(10)
129	stw	4,8(10)
130	stw	4,12(10)
131	stw	4,16(10)
132	stw	4,20(10)
133	stw	4,24(10)
134	stw	4,28(10)
135	bdz	L(tail_bytes)
136
137	addi	10,10,64
138	stw	4,0(12)
139	stw	4,4(12)
140	stw	4,8(12)
141	stw	4,12(12)
142	stw	4,16(12)
143	stw	4,20(12)
144	stw	4,24(12)
145	stw	4,28(12)
146	bdnz	L(big_loop_fast_setup)
147
148	mr	12,10
149	b	L(tail_bytes)
150
151	/* Now that we're probably past the LHS window, use the VSX to
152	   speed up the loop.  */
153L(big_loop_fast_setup):
154	li	11,24
155	li	6,16
156	lxvdsx	4,1,11
157
158	.align	4
159L(big_loop_fast):
160	addi	12,10,32
161	stxvd2x	4,0,10
162	stxvd2x	4,10,6
163	bdz	L(tail_bytes)
164
165	addi	10,10,64
166	stxvd2x	4,0,12
167	stxvd2x	4,12,6
168	bdnz	L(big_loop_fast)
169
170	mr	12,10
171
172	.align	4
173L(tail_bytes):
174
175	/* Check for tail bytes.  */
176	mr	1,7		/* Restore r1.  */
177	beqlr	cr6
178
179	clrlwi	0,5,29
180	mtocrf	0x01,0
181
182	/*  At this point we have a tail of 0-7 bytes and we know that the
183	destination is doubleword-aligned.  */
1844:	/* Copy 4 bytes.  */
185	bf	29,2f
186
187	stw	4,0(12)
188	addi	12,12,4
1892:	/* Copy 2 bytes.  */
190	bf	30,1f
191
192	sth	4,0(12)
193	addi	12,12,2
1941:	/* Copy 1 byte.  */
195	bflr	31
196
197	stb	4,0(12)
198	blr
199
200
201	/* Special case when value is 0 and we have a long length to deal
202	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
203	   dcbz though, we need to get the destination 128-bytes aligned.  */
204	.align	4
205L(huge):
206	lfd	4,24(1)
207	andi.	11,10,127
208	neg	0,10
209	beq	L(huge_aligned)
210
211	clrlwi	0,0,25
212	subf	5,0,5
213	srwi	0,0,3
214	mtocrf  0x01,0
215
216	/* Get DST aligned to 128 bytes.  */
2178:	bf	28,4f
218
219	stfd	4,0(10)
220	stfd	4,8(10)
221	stfd	4,16(10)
222	stfd	4,24(10)
223	stfd	4,32(10)
224	stfd	4,40(10)
225	stfd	4,48(10)
226	stfd	4,56(10)
227	addi	10,10,64
228	.align	4
2294:	bf	29,2f
230
231	stfd	4,0(10)
232	stfd	4,8(10)
233	stfd	4,16(10)
234	stfd	4,24(10)
235	addi	10,10,32
236	.align	4
2372:	bf	30,1f
238
239	stfd	4,0(10)
240	stfd	4,8(10)
241	addi	10,10,16
242	.align	4
2431:	bf	31,L(huge_aligned)
244
245	stfd	4,0(10)
246	addi	10,10,8
247
248L(huge_aligned):
249	srwi	8,5,7
250	clrlwi	11,5,25
251	cmplwi	cr6,11,0
252	mtctr	8
253
254	/* Copies 128-bytes at a time.  */
255	.align	4
256L(huge_loop):
257	dcbz	0,10
258	addi	10,10,128
259	bdnz	L(huge_loop)
260
261	/* We have a tail of 0~127 bytes to handle.  */
262	mr	1,7		/* Restore r1.  */
263	beqlr	cr6
264
265	subf	9,3,10
266	subf	5,9,12
267	srwi	8,5,3
268	cmplwi	cr6,8,0
269	mtocrf	0x01,8
270
271	/* We have a tail o 1~127 bytes. Copy up to 15 doublewords for
272	speed.  We'll handle the resulting tail bytes later.  */
273	beq	cr6,L(tail)
274
2758:	bf	28,4f
276
277	stfd	4,0(10)
278	stfd	4,8(10)
279	stfd	4,16(10)
280	stfd	4,24(10)
281	stfd	4,32(10)
282	stfd	4,40(10)
283	stfd	4,48(10)
284	stfd	4,56(10)
285	addi	10,10,64
286	.align	4
2874:	bf	29,2f
288
289	stfd	4,0(10)
290	stfd	4,8(10)
291	stfd	4,16(10)
292	stfd	4,24(10)
293	addi	10,10,32
294	.align	4
2952:	bf	30,1f
296
297	stfd	4,0(10)
298	stfd	4,8(10)
299	addi	10,10,16
300	.align	4
3011:	bf	31,L(tail)
302
303	stfd	4,0(10)
304	addi	10,10,8
305
306	/* Handle the rest of the tail bytes here.  */
307L(tail):
308	mtocrf	0x01,5
309
310	.align	4
3114:	bf	29,2f
312
313	stw	4,0(10)
314	addi	10,10,4
315	.align	4
3162:	bf	30,1f
317
318	sth	4,0(10)
319	addi	10,10,2
320	.align	4
3211:	bflr	31
322
323	stb	4,0(10)
324	blr
325
326
327	/* Expanded tree to copy tail bytes without increments.  */
328	.align	4
329L(copy_tail):
330	bf	29,L(FXX)
331
332	stw	4,0(10)
333	bf	30,L(TFX)
334
335	sth	4,4(10)
336	bflr	31
337
338	stb	4,6(10)
339	blr
340
341	.align	4
342L(FXX):	bf	30,L(FFX)
343
344	sth	4,0(10)
345	bflr	31
346
347	stb	4,2(10)
348	blr
349
350	.align	4
351L(TFX):	bflr	31
352
353	stb	4,4(10)
354	blr
355
356	.align	4
357L(FFX):	bflr	31
358
359	stb	4,0(10)
360	blr
361
362	/* Handle copies of 9~31 bytes.  */
363	.align	4
364L(medium):
365	/* At least 9 bytes to go.  */
366	andi.	11,10,3
367	clrlwi	0,0,30
368	beq	L(medium_aligned)
369
370	/* Force 4-bytes alignment for DST.  */
371	mtocrf	0x01,0
372	subf	5,0,5
3731:	/* Copy 1 byte.  */
374	bf	31,2f
375
376	stb	4,0(10)
377	addi	10,10,1
3782:	/* Copy 2 bytes.  */
379	bf	30,L(medium_aligned)
380
381	sth	4,0(10)
382	addi	10,10,2
383
384	.align	4
385L(medium_aligned):
386	/* At least 6 bytes to go, and DST is word-aligned.  */
387	cmplwi	cr1,5,16
388	mtocrf	0x01,5
389	blt	cr1,8f
390
391	/* Copy 16 bytes.  */
392	stw	4,0(10)
393	stw	4,4(10)
394	stw	4,8(10)
395	stw	4,12(10)
396	addi	10,10,16
3978:	/* Copy 8 bytes.  */
398	bf	28,4f
399
400	stw	4,0(10)
401	stw	4,4(10)
402	addi	10,10,8
4034:	/* Copy 4 bytes.  */
404	bf	29,2f
405
406	stw	4,0(10)
407	addi	10,10,4
4082:	/* Copy 2-3 bytes.  */
409	bf	30,1f
410
411	sth	4,0(10)
412	addi	10,10,2
4131:	/* Copy 1 byte.  */
414	bflr	31
415
416	stb	4,0(10)
417	blr
418
419	/* Handles copies of 0~8 bytes.  */
420	.align	4
421L(small):
422	mtocrf	0x01,5
423	bne	cr6,L(copy_tail)
424
425	stw	4,0(10)
426	stw	4,4(10)
427	blr
428
429END (memset)
430libc_hidden_builtin_def (memset)
431