1/* Optimized memset implementation for PowerPC64.
2   Copyright (C) 1997-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
22   Returns 's'.
23
24   The fill is done at several granularities: byte, halfword and word
25   stores to reach doubleword alignment, doubleword (64-bit) stores in
26   32-byte blocks, and, when the fill byte is 0, 128-byte lines via dcbz.  */
27
/* Name of the symbol defined below.  It defaults to memset, but a file that
   defines MEMSET before this point gets that name instead.  */
28#ifndef MEMSET
29# define MEMSET memset
30#endif
/* Tell the assembler to accept the POWER4 instruction set.  */
31	.machine power4
/* MEMSET: fill rLEN (r5) bytes starting at r3 with the low byte of r4.
   r3 is never written by the code below, so it still holds 's' on return.
   Lengths <= 8 go to L(small), lengths <= 31 to L(medium); longer fills are
   aligned to 8 and then 32 bytes and written in 32-byte blocks, with a dcbz
   path when the fill value is zero.
   NOTE(review): ENTRY_TOCLESS, CALL_MCOUNT, END_GEN_TB and
   libc_hidden_builtin_def are not defined in this file (presumably they come
   from <sysdep.h>); their exact expansion should be confirmed there.  */
32ENTRY_TOCLESS (MEMSET, 5)
33	CALL_MCOUNT 3
34
/* Register roles.  rMEMP2, rNEG64 and rCLS all name r8; each is used in a
   different phase of the routine.  */
35#define rTMP	r0
36#define rRTN	r3	/* Initial value of 1st argument.  */
37#define rMEMP0	r3	/* Original value of 1st arg.  */
38#define rCHR	r4	/* Char to set in each byte.  */
39#define rLEN	r5	/* Length of region to set.  */
40#define rMEMP	r6	/* Address at which we are storing.  */
41#define rALIGN	r7	/* Number of bytes we are setting now (when aligning). */
42#define rMEMP2	r8	/* Scratch store pointer while aligning to 32 bytes.  */
43
44#define rNEG64	r8	/* Constant -64, the dcbtst offset used in L(c3).  */
45#define rCLS	r8	/* Cache line size; loaded with the constant 128.  */
46#define rCLM	r9	/* Cache line size mask; not referenced in the code below.  */
47L(_memset):
48/* Take care of case for size <= 8.  */
49	cmpldi	cr1, rLEN, 8
50	andi.	rALIGN, rMEMP0, 7	/* cr0.eq if already doubleword aligned.  */
51	mr	rMEMP, rMEMP0
52	ble-	cr1, L(small)
53
54/* Align to doubleword boundary.  */
55	cmpldi	cr5, rLEN, 31		/* Result is consumed at L(aligned).  */
56	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
57	beq+	L(aligned2)		/* Uses cr0 from the andi. above.  */
58	mtcrf	0x01, rMEMP0		/* cr7 = low 4 bits of start address.  */
59	subfic	rALIGN, rALIGN, 8	/* Bytes needed to reach the boundary.  */
60	cror	28,30,31		/* Detect odd word aligned.  */
61	add	rMEMP, rMEMP, rALIGN	/* rMEMP = first doubleword boundary.  */
62	sub	rLEN, rLEN, rALIGN
63	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
64	bt	29, L(g4)		/* Address bit of weight 4 is set.  */
65/* Process the even word of doubleword.  */
66	bf+	31, L(g2)		/* Even address: no leading byte.  */
67	stb	rCHR, 0(rMEMP0)		/* Leading byte at the original address.  */
68	bt	30, L(g4x)
69L(g2):
70	sth	rCHR, -6(rMEMP)
71L(g4x):
72	stw	rCHR, -4(rMEMP)		/* Last word before the boundary.  */
73	b	L(aligned)
74/* Process the odd word of doubleword.  */
75L(g4):
76	bf	28, L(g4x) /* If false, word aligned on odd word.  */
77	bf+	31, L(g0)
78	stb	rCHR, 0(rMEMP0)
79	bt	30, L(aligned)
80L(g0):
81	sth	rCHR, -2(rMEMP)
82
83/* Handle the case of size <= 31 (cr5 holds the compare made before the
   alignment bytes were subtracted from rLEN).  */
84L(aligned2):
85	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
86L(aligned):
87	mtcrf	0x01, rLEN		/* cr7 = low 4 bits of remaining length.  */
88	ble	cr5, L(medium)
89/* Align to 32-byte boundary.  */
90	andi.	rALIGN, rMEMP, 0x18	/* cr0.eq if already 32-byte aligned.  */
91	subfic	rALIGN, rALIGN, 0x20
92	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
93	beq	L(caligned)
94	mtcrf	0x01, rALIGN		/* cr7 bit 28 = the 8 bit of rALIGN.  */
95	add	rMEMP, rMEMP, rALIGN
96	sub	rLEN, rLEN, rALIGN
97	cmplwi	cr1, rALIGN, 0x10
98	mr	rMEMP2, rMEMP		/* Store downward from the new rMEMP.  */
99	bf	28, L(a1)
100	stdu	rCHR, -8(rMEMP2)
101L(a1):	blt	cr1, L(a2)
102	std	rCHR, -8(rMEMP2)
103	stdu	rCHR, -16(rMEMP2)
104L(a2):
105
106/* Now aligned to a 32 byte boundary.  */
107L(caligned):
108	cmpldi	cr1, rCHR, 0		/* Is the replicated fill value zero?  */
109	clrrdi.	rALIGN, rLEN, 5		/* Bytes in whole 32-byte blocks.  */
110	mtcrf	0x01, rLEN		/* cr7 = low 4 bits of rLEN for the tail.  */
111	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
/* On entry rALIGN holds the byte count of whole 32-byte blocks and cr0
   reflects whether that count is zero.  */
112L(nondcbz):
113	srdi	rTMP, rALIGN, 5
114	mtctr	rTMP			/* CTR = number of 32-byte blocks.  */
115	beq	L(medium)	/* We may not actually get to do a full line.  */
116	clrldi.	rLEN, rLEN, 59		/* rLEN = leftover bytes (low 5 bits).  */
117	add	rMEMP, rMEMP, rALIGN	/* End of the blocks; stores run downward.  */
118	li	rNEG64, -0x40
119	bdz	L(cloopdone)		/* Single block: skip the loop.  */
120
121L(c3):	dcbtst	rNEG64, rMEMP
122	std	rCHR, -8(rMEMP)
123	std	rCHR, -16(rMEMP)
124	std	rCHR, -24(rMEMP)
125	stdu	rCHR, -32(rMEMP)
126	bdnz	L(c3)
/* Store the final 32-byte block.  */
127L(cloopdone):
128	std	rCHR, -8(rMEMP)
129	std	rCHR, -16(rMEMP)
130	cmpldi	cr1, rLEN, 16		/* Set up cr1 for L(medium_tail2).  */
131	std	rCHR, -24(rMEMP)
132	stdu	rCHR, -32(rMEMP)
133	beqlr				/* No leftover: cr0 from clrldi. above.  */
134	add	rMEMP, rMEMP, rALIGN	/* Back to the end of the blocks.  */
135	b	L(medium_tail2)
136
137	.align 5
138/* Clear lines of memory in 128-byte chunks.  */
139L(zloopstart):
140/* If the remaining length is less than 32 bytes, don't bother getting
141	 the cache line size.  */
142	beq	L(medium)		/* Uses cr0 from clrrdi. at L(caligned).  */
143	li      rCLS,128  /* cache line size is 128 */
144
145/* Now we know the cache line size, and it is not 32-bytes, but
146	 we may not yet be aligned to the cache line. May have a partial
147	 line to fill, so touch it 1st.  */
148	dcbt	0,rMEMP
/* Store 32 bytes at a time until rMEMP is 128-byte aligned or fewer than
   32 bytes remain.  */
149L(getCacheAligned):
150	cmpldi	cr1,rLEN,32
151	andi.	rTMP,rMEMP,127
152	blt	cr1,L(handletail32)
153	beq	L(cacheAligned)
154	addi	rMEMP,rMEMP,32
155	addi	rLEN,rLEN,-32
156	std	rCHR,-32(rMEMP)
157	std	rCHR,-24(rMEMP)
158	std	rCHR,-16(rMEMP)
159	std	rCHR,-8(rMEMP)
160	b	L(getCacheAligned)
161
162/* Now we are aligned to the cache line and can use dcbz.  */
163L(cacheAligned):
164	cmpld	cr1,rLEN,rCLS
165	blt	cr1,L(handletail32)
166	dcbz	0,rMEMP
167	subf	rLEN,rCLS,rLEN
168	add	rMEMP,rMEMP,rCLS
169	b	L(cacheAligned)
170
171/* We are here because the cache line size was set and was not 32-bytes
172   and the remainder (rLEN) is less than the actual cache line size.
173   So set up the preconditions for L(nondcbz) and go there.  */
174L(handletail32):
175	clrrwi.	rALIGN, rLEN, 5		/* Whole-block bytes; sets cr0.  */
176	b		L(nondcbz)
177
178	.align 5
179L(small):
180/* Memset of 8 bytes or less.  */
181	cmpldi	cr6, rLEN, 4
182	cmpldi	cr5, rLEN, 1
183	ble	cr6,L(le4)
184	subi	rLEN, rLEN, 4		/* More than 4: store 4 bytes first.  */
185	stb	rCHR,0(rMEMP)
186	stb	rCHR,1(rMEMP)
187	stb	rCHR,2(rMEMP)
188	stb	rCHR,3(rMEMP)
189	addi	rMEMP,rMEMP, 4
190	cmpldi	cr5, rLEN, 1		/* Redo the compare for the new rLEN.  */
/* Store the remaining 0 to 4 bytes; cr5 compares rLEN with 1, cr1 with 3.  */
191L(le4):
192	cmpldi	cr1, rLEN, 3
193	bltlr	cr5
194	stb	rCHR, 0(rMEMP)
195	beqlr	cr5
196	stb	rCHR, 1(rMEMP)
197	bltlr	cr1
198	stb	rCHR, 2(rMEMP)
199	beqlr	cr1
200	stb	rCHR, 3(rMEMP)
201	blr
202
203/* Memset of 0-31 bytes.  On entry cr7 holds the low 4 bits of rLEN.  The
   stores run downward from the end of the region: cr7 bits 31, 30, 29 and
   28 select the 1, 2, 4 and 8 byte stores, and cr1 (rLEN compared with 16)
   selects the 16 byte store.  */
204	.align 5
205L(medium):
206	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
207	cmpldi	cr1, rLEN, 16
208L(medium_tail2):
209	add	rMEMP, rMEMP, rLEN	/* rMEMP = one past the last byte.  */
210L(medium_tail):
211	bt-	31, L(medium_31t)
212	bt-	30, L(medium_30t)
213L(medium_30f):
214	bt-	29, L(medium_29t)
215L(medium_29f):
216	bge-	cr1, L(medium_27t)
217	bflr-	28
218	std	rCHR, -8(rMEMP)
219	blr
220
221L(medium_31t):
222	stbu	rCHR, -1(rMEMP)
223	bf-	30, L(medium_30f)
224L(medium_30t):
225	sthu	rCHR, -2(rMEMP)
226	bf-	29, L(medium_29f)
227L(medium_29t):
228	stwu	rCHR, -4(rMEMP)
229	blt-	cr1, L(medium_27f)
230L(medium_27t):
231	std	rCHR, -8(rMEMP)
232	stdu	rCHR, -16(rMEMP)
233L(medium_27f):
234	bflr-	28
235L(medium_28t):
236	std	rCHR, -8(rMEMP)
237	blr
238END_GEN_TB (MEMSET,TB_TOCLESS)
239libc_hidden_builtin_def (memset)
240