1/* Optimized 64-bit memset implementation for POWER6.
2   Copyright (C) 1997-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
22   Returns 's'.
23
24   The memset is done in several sizes: byte (8 bits) up to doubleword
25   (64 bits) stores, 32-byte sectors, and 128-byte cache lines.  There is a
26   special case for zeroing whole cache lines with the dcbz instruction.  */
27
28#ifndef MEMSET
29# define MEMSET memset
30#endif
31	.machine power6
32ENTRY_TOCLESS (MEMSET, 7)
33	CALL_MCOUNT 3
34
/* Register roles.  r3 is never written by the code below, so it still
   holds 's' when the routine returns.  */
35#define rTMP	r0	/* Scratch; only receives discarded andi. results.  */
36#define rRTN	r3	/* Initial value of 1st argument.  */
37#define rMEMP0	r3	/* Original value of 1st arg.  */
38#define rCHR	r4	/* Char to set in each byte.  */
39#define rLEN	r5	/* Length of region to set.  */
40#define rMEMP	r6	/* Address at which we are storing.  */
41#define rALIGN	r7	/* Number of bytes we are setting now (when aligning). */
42#define rMEMP2	r8	/* Scratch pointer while aligning; later the dcbz index (128).  */
43#define rMEMP3	r9	/* Alt mem pointer.  */
44L(_memset):
45/* Take care of case for size <= 8.  */
46	cmpldi	cr1, rLEN, 8
47	andi.	rALIGN, rMEMP0, 7	/* cr0.eq <=> s is doubleword aligned.  */
48	mr	rMEMP, rMEMP0
49	ble	cr1, L(small)
50
51/* Align to doubleword boundary.  cr5 records the ORIGINAL length against
   31; it is only tested at L(aligned), after rLEN may have been reduced
   by the alignment bytes.  */
52	cmpldi	cr5, rLEN, 31
53	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
54	beq+	L(aligned2)	/* cr0 from the andi. above: already aligned.  */
55	mtcrf	0x01, rMEMP0	/* cr7 = low 4 bits of s: bit 29 = 4, 30 = 2, 31 = 1.  */
56	subfic	rALIGN, rALIGN, 8	/* Bytes up to the next doubleword boundary.  */
57	cror	28,30,31		/* Bit 28 = s is not word aligned.  */
58	add	rMEMP, rMEMP, rALIGN	/* rMEMP = first doubleword boundary after s.  */
59	sub	rLEN, rLEN, rALIGN
60	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
61	bt	29, L(g4)	/* The 4 bit of s is set: s lies in the odd word.  */
62/* Process the even word of doubleword.  */
63	bf+	31, L(g2)	/* Even address: no leading byte.  */
64	stb	rCHR, 0(rMEMP0)
65	bt	30, L(g4x)	/* Offset 3: the byte finished the even word.  */
66L(g2):
67	sth	rCHR, -6(rMEMP)	/* Offsets 2-3 of the doubleword.  */
68L(g4x):
69	stw	rCHR, -4(rMEMP)	/* Whole odd word, offsets 4-7.  */
70	b	L(aligned)
71/* Process the odd word of doubleword.  */
72L(g4):
73	bf	28, L(g4x) /* If false, word aligned on odd word.  */
74	bf+	31, L(g0)	/* Even address: no leading byte.  */
75	stb	rCHR, 0(rMEMP0)
76	bt	30, L(aligned)	/* Offset 7: the byte was all that was missing.  */
77L(g0):
78	sth	rCHR, -2(rMEMP)	/* Offsets 6-7 of the doubleword.  */
79
80/* Handle the case of size <= 31.  cr5 was set from the length on entry,
   before any alignment bytes were subtracted.  cr7 is loaded with the low
   4 bits of the remaining rLEN for L(medium).  */
81L(aligned2):
82	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
83L(aligned):
84	mtcrf	0x01, rLEN
85	ble	cr5, L(medium)
86/* Align to 32-byte boundary.  */
87	andi.	rALIGN, rMEMP, 0x18
88	subfic	rALIGN, rALIGN, 0x20
89	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
90	beq	L(caligned)	/* cr0 from the andi.: already 32-byte aligned.  */
91	mtcrf	0x01, rALIGN	/* rALIGN is 8, 16 or 24; bit 28 = its 8 bit.  */
92	add	rMEMP, rMEMP, rALIGN
93	sub	rLEN, rLEN, rALIGN
94	cmplwi	cr1, rALIGN, 0x10
95	mr	rMEMP2, rMEMP	/* Fill backwards from the new boundary.  */
96	bf	28, L(a1)
97	stdu	rCHR, -8(rMEMP2)	/* The odd 8 bytes (rALIGN = 8 or 24).  */
98L(a1):	blt	cr1, L(a2)
99	std	rCHR, -8(rMEMP2)	/* 16 more bytes (rALIGN = 16 or 24).  */
100	stdu	rCHR, -16(rMEMP2)
101L(a2):
102
103/* Now aligned to a 32 byte boundary.  cr7 is reloaded with the low 4 bits
   of rLEN for L(medium); every later adjustment of rLEN is a multiple of
   32, so those bits stay valid until the tail is stored.  */
104        .align 4
105L(caligned):
106	cmpldi	cr1, rCHR, 0
107	clrrdi.	rALIGN, rLEN, 5	/* cr0.eq <=> fewer than 32 bytes remain.  */
108	mtcrf	0x01, rLEN
109	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
110	beq	L(medium)	/* We may not actually get to do a full line.  */
111	.align 4
112/* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
113   boundary but may not be at cache line (128-byte) boundary.  */
114L(nzloopstart):
115/* memset in 32-byte chunks until we get to a cache line boundary (at
116   most three chunks).  If rLEN is less than 128 use the cacheAligned1
117   code to finish the tail instead.  */
118	cmpldi	cr1,rLEN,128
119
120	andi.	rTMP,rMEMP,127	/* cr0.eq <=> on a cache-line boundary.  */
121	blt	cr1,L(cacheAligned1)
122	addi	rMEMP3,rMEMP,32
123	beq	L(nzCacheAligned)
124	addi	rLEN,rLEN,-32	/* First 32-byte chunk.  */
125	std	rCHR,0(rMEMP)
126	std	rCHR,8(rMEMP)
127	std	rCHR,16(rMEMP)
128	addi	rMEMP,rMEMP,32
129	andi.	rTMP,rMEMP3,127	/* rMEMP3 equals the advanced rMEMP here.  */
130	std	rCHR,-8(rMEMP3)	/* Last doubleword of the first chunk.  */
131
132	beq	L(nzCacheAligned)
133	addi	rLEN,rLEN,-32	/* Second chunk, at rMEMP3.  */
134	std	rCHR,0(rMEMP3)
135	addi	rMEMP,rMEMP,32
136	std	rCHR,8(rMEMP3)
137	andi.	rTMP,rMEMP,127
138	std	rCHR,16(rMEMP3)
139	std	rCHR,24(rMEMP3)
140
141	beq	L(nzCacheAligned)
142	addi	rLEN,rLEN,-32	/* Third chunk, at rMEMP3 + 32.  */
143	std	rCHR,32(rMEMP3)
144	addi	rMEMP,rMEMP,32
145	cmpldi	cr1,rLEN,128
146	std	rCHR,40(rMEMP3)
147	cmpldi	cr6,rLEN,256	/* NOTE(review): cr6 and rMEMP2 look unused on the non-zero path.  */
148	li	rMEMP2,128
149	std	rCHR,48(rMEMP3)
150	std	rCHR,56(rMEMP3)
151	blt	cr1,L(cacheAligned1)
152	b	L(nzCacheAligned128)
153
154/* Now we are aligned to the cache line and can use dcbtst.  */
155        .align 4
156L(nzCacheAligned):
157	cmpldi	cr1,rLEN,128
158	blt	cr1,L(cacheAligned1)
159	b	L(nzCacheAligned128)
160        .align 5
/* Store 128 bytes per iteration as sixteen doublewords.  cr1 compares
   rLEN with 256 BEFORE the 128 is subtracted, so the bge at the bottom
   loops only while at least 128 bytes will remain.  */
161L(nzCacheAligned128):
162	cmpldi	cr1,rLEN,256
163	addi	rMEMP3,rMEMP,64	/* Second half of the line.  */
164	std	rCHR,0(rMEMP)
165	std	rCHR,8(rMEMP)
166	std	rCHR,16(rMEMP)
167	std	rCHR,24(rMEMP)
168	std	rCHR,32(rMEMP)
169	std	rCHR,40(rMEMP)
170	std	rCHR,48(rMEMP)
171	std	rCHR,56(rMEMP)
172	addi	rMEMP,rMEMP3,64	/* rMEMP advances by 128 in total.  */
173	addi	rLEN,rLEN,-128
174	std	rCHR,0(rMEMP3)
175	std	rCHR,8(rMEMP3)
176	std	rCHR,16(rMEMP3)
177	std	rCHR,24(rMEMP3)
178	std	rCHR,32(rMEMP3)
179	std	rCHR,40(rMEMP3)
180	std	rCHR,48(rMEMP3)
181	std	rCHR,56(rMEMP3)
182	bge	cr1,L(nzCacheAligned128)
183	dcbtst	0,rMEMP	/* Touch-for-store hint on the block holding the tail.  */
184	b	L(cacheAligned1)
185	.align 5
186/* Storing a zero "c" value. We are aligned at a sector (32-byte)
187   boundary but may not be at cache line (128-byte) boundary.  If the
188   remaining length spans a full cache line we can use the Data cache
189   block zero instruction. */
190L(zloopstart):
191/* memset in 32-byte chunks until we get to a cache line boundary (at
192   most three chunks).  If rLEN is less than 128 use the cacheAligned1
193   code to finish the tail instead.  */
194	cmpldi	cr1,rLEN,128
195	beq	L(medium)	/* cr0 from clrrdi. at L(caligned): under 32 bytes left.  */
196L(getCacheAligned):
197	andi.	rTMP,rMEMP,127	/* cr0.eq <=> on a cache-line boundary.  */
198	nop
199	blt	cr1,L(cacheAligned1)
200	addi	rMEMP3,rMEMP,32
201	beq	L(cacheAligned)
202	addi	rLEN,rLEN,-32	/* First 32-byte chunk.  */
203	std	rCHR,0(rMEMP)
204	std	rCHR,8(rMEMP)
205	std	rCHR,16(rMEMP)
206	addi	rMEMP,rMEMP,32
207	andi.	rTMP,rMEMP3,127	/* rMEMP3 equals the advanced rMEMP here.  */
208	std	rCHR,-8(rMEMP3)	/* Last doubleword of the first chunk.  */
209L(getCacheAligned2):
210	beq	L(cacheAligned)
211	addi	rLEN,rLEN,-32	/* Second chunk, at rMEMP3.  */
212	std	rCHR,0(rMEMP3)
213	std	rCHR,8(rMEMP3)
214	addi	rMEMP,rMEMP,32
215	andi.	rTMP,rMEMP,127
216	std	rCHR,16(rMEMP3)
217	std	rCHR,24(rMEMP3)
218L(getCacheAligned3):
219	beq	L(cacheAligned)
220	addi	rLEN,rLEN,-32	/* Third chunk, at rMEMP3 + 32.  */
221	std	rCHR,32(rMEMP3)
222	addi	rMEMP,rMEMP,32
223	cmpldi	cr1,rLEN,128
224	std	rCHR,40(rMEMP3)
225	cmpldi	cr6,rLEN,256	/* cr6 and rMEMP2 are the L(cacheAlignedx) preconditions.  */
226	li	rMEMP2,128
227	std	rCHR,48(rMEMP3)
228	std	rCHR,56(rMEMP3)
229	blt	cr1,L(cacheAligned1)
230	blt	cr6,L(cacheAligned128)
231	b	L(cacheAlignedx)
232
233/* Now we are aligned to the cache line and can use dcbz.  Every dcbz
   below is paired with a 128-byte advance, i.e. the code assumes that
   one dcbz clears one 128-byte block.  */
234        .align 5
235L(cacheAligned):
236	cmpldi	cr1,rLEN,128
237	cmpldi	cr6,rLEN,256
238	blt	cr1,L(cacheAligned1)
239	li	rMEMP2,128	/* Index register for the second dcbz of a pair.  */
240L(cacheAlignedx):
/* Entered with rLEN >= 128, cr6 = rLEN compared with 256, rMEMP2 = 128.  */
241	cmpldi	cr5,rLEN,640
242	blt	cr6,L(cacheAligned128)	/* Under 256: a single block.  */
243	bgt	cr5,L(cacheAligned512)	/* Over 640: the simple loop.  */
244	cmpldi	cr6,rLEN,512
245	dcbz	0,rMEMP
246	cmpldi	cr1,rLEN,384
247	dcbz	rMEMP2,rMEMP	/* Block at rMEMP + 128.  */
248	addi	rMEMP,rMEMP,256
249	addi	rLEN,rLEN,-256
250	blt	cr1,L(cacheAligned1)	/* Was under 384: less than a block left.  */
251	blt	cr6,L(cacheAligned128)	/* Was under 512: exactly one more block.  */
252	b	L(cacheAligned256)
253	.align 5
254/* A simple loop for the longer (>640 bytes) lengths.  This form limits
255   branch mispredictions to exactly one, at loop exit.  */
256L(cacheAligned512):
257	cmpldi	cr1,rLEN,128
258	blt	cr1,L(cacheAligned1)
259	dcbz	0,rMEMP
260	addi	rLEN,rLEN,-128
261	addi	rMEMP,rMEMP,128
262	b	L(cacheAligned512)
263        .align 5
/* Two blocks per iteration.  cr6 and cr1 are computed from rLEN before
   the 256 is subtracted, so they describe what will remain afterwards
   (>= 256, or < 128 respectively).  */
264L(cacheAligned256):
265
266	cmpldi	cr6,rLEN,512
267
268	dcbz	0,rMEMP
269	cmpldi	cr1,rLEN,384
270	dcbz	rMEMP2,rMEMP
271	addi	rMEMP,rMEMP,256
272	addi	rLEN,rLEN,-256
273
274	bge	cr6,L(cacheAligned256)
275
276	blt	cr1,L(cacheAligned1)
277        .align 4
278L(cacheAligned128):
279	dcbz	0,rMEMP
280	addi	rMEMP,rMEMP,128
281	addi	rLEN,rLEN,-128
282        nop
/* Fewer than 128 bytes remain on every path that reaches this label, so
   at most three 32-byte chunks are stored before the sub-32-byte tail.  */
283L(cacheAligned1):
284	cmpldi	cr1,rLEN,32
285	blt	cr1,L(handletail32)
286	addi	rMEMP3,rMEMP,32
287	addi	rLEN,rLEN,-32
288	std	rCHR,0(rMEMP)
289	std	rCHR,8(rMEMP)
290	std	rCHR,16(rMEMP)
291	addi	rMEMP,rMEMP,32
292	cmpldi	cr1,rLEN,32
293	std	rCHR,-8(rMEMP3)	/* Last doubleword of the first chunk.  */
294L(cacheAligned2):
295	blt	cr1,L(handletail32)
296	addi	rLEN,rLEN,-32
297	std	rCHR,0(rMEMP3)
298	std	rCHR,8(rMEMP3)
299	addi	rMEMP,rMEMP,32
300	cmpldi	cr1,rLEN,32
301	std	rCHR,16(rMEMP3)
302	std	rCHR,24(rMEMP3)
303	nop
304L(cacheAligned3):
305	blt	cr1,L(handletail32)
306	addi	rMEMP,rMEMP,32
307	addi	rLEN,rLEN,-32
308	std	rCHR,32(rMEMP3)
309	std	rCHR,40(rMEMP3)
310	std	rCHR,48(rMEMP3)
311	std	rCHR,56(rMEMP3)
312
313/* We are here because the length or remainder (rLEN) is less than the
314   cache line/sector size and does not justify aggressive loop unrolling.
315   So set up the preconditions for L(medium) and go there.  */
316        .align 3
317L(handletail32):
318	cmpldi	cr1,rLEN,0
319	beqlr   cr1
320	b	L(medium)
321
322	.align 5
323L(small):
324/* Memset of 8 bytes or less.  Only byte stores are used.  */
325	cmpldi	cr6, rLEN, 4
326	cmpldi	cr5, rLEN, 1
327	ble	cr6,L(le4)
328	subi	rLEN, rLEN, 4	/* 5..8 bytes: store four, then the other 1..4.  */
329	stb	rCHR,0(rMEMP)
330	stb	rCHR,1(rMEMP)
331	stb	rCHR,2(rMEMP)
332	stb	rCHR,3(rMEMP)
333	addi	rMEMP,rMEMP, 4
334	cmpldi	cr5, rLEN, 1
335L(le4):
336	cmpldi	cr1, rLEN, 3
337	bltlr	cr5	/* rLEN == 0.  */
338	stb	rCHR, 0(rMEMP)
339	beqlr	cr5	/* rLEN == 1.  */
340	stb	rCHR, 1(rMEMP)
341	bltlr	cr1	/* rLEN == 2.  */
342	stb	rCHR, 2(rMEMP)
343	beqlr	cr1	/* rLEN == 3.  */
344	stb	rCHR, 3(rMEMP)
345	blr
346
347/* Memset of 0-31 bytes.  On entry cr7 holds the low 4 bits of rLEN
   (bit 28 = 8, 29 = 4, 30 = 2, 31 = 1) and rCHR is replicated to at
   least a word.  rMEMP is moved to the end of the region and the pieces
   are stored backwards from there: byte, halfword, word, 16 bytes, 8.  */
348	.align 5
349L(medium):
350	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
351	cmpldi	cr1, rLEN, 16	/* cr1 stands in for the 16 bit of rLEN.  */
352L(medium_tail2):
353	add	rMEMP, rMEMP, rLEN
354L(medium_tail):
355	bt-	31, L(medium_31t)
356	bt-	30, L(medium_30t)
357L(medium_30f):
358	bt	29, L(medium_29t)
359L(medium_29f):
360	bge	cr1, L(medium_27t)
361	bflr	28	/* No 8-byte piece: done.  */
362	std	rCHR, -8(rMEMP)
363	blr
364
365L(medium_31t):
366	stbu	rCHR, -1(rMEMP)
367	bf-	30, L(medium_30f)
368L(medium_30t):
369	sthu	rCHR, -2(rMEMP)
370	bf-	29, L(medium_29f)
371L(medium_29t):
372	stwu	rCHR, -4(rMEMP)
373	blt	cr1, L(medium_27f)
374L(medium_27t):
375	std	rCHR, -8(rMEMP)	/* 16 bytes; rMEMP steps back by 16.  */
376	stdu	rCHR, -16(rMEMP)
377L(medium_27f):
378	bflr	28	/* No 8-byte piece: done.  */
379L(medium_28t):
380	std	rCHR, -8(rMEMP)
381	blr
382END_GEN_TB (MEMSET,TB_TOCLESS)
383libc_hidden_builtin_def (memset)
384