/* Optimized memset implementation for PowerPC64.
   Copyright (C) 1997-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#include <sysdep.h>
#include <rtld-global-offsets.h>

/* Declaration consumed by the __GLRO (..., dl_cache_line_size, ...) load
   in the function below.  Both macros are defined outside this file.  */
	.section	".toc","aw"
__GLRO_DEF(dl_cache_line_size)

	.section	".text"
	.align 2

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   The memset is done with stores of several sizes: byte (8 bits),
   halfword and word (16/32 bits) while aligning, doubleword (64 bits),
   and 32-byte blocks (256 bits) for the bulk of the region.  There is a
   special case for setting memory to 0, which takes advantage of the
   dcbz instruction to clear whole cache lines.  */

/* The entry-point name can be overridden by defining MEMSET before this
   point; it defaults to plain memset.  */
#ifndef MEMSET
# define MEMSET memset
#endif
38
/* MEMSET: store the low byte of rCHR into rLEN bytes starting at rMEMP0
   and return the original pointer (r3 is never written below).

   Strategy, as implemented here:
     - n <= 8:  plain byte stores (L(small)).
     - Otherwise the store pointer is first aligned to a doubleword while
       the fill byte is replicated across rCHR.
     - Original n <= 31:  finish through L(medium).
     - Otherwise align to 32 bytes and store 32-byte blocks (L(nondcbz)),
       or, when the replicated fill value is zero and a cache line size is
       available, clear whole cache lines with dcbz (L(zloopstart)).
     - Whatever is left (< 32 bytes) is stored backwards from the end of
       the region by L(medium_tail2).

   Convention used throughout: "mtcrf 0x01, rX" copies the low four bits
   of rX into CR field 7, so CR bit 31 is the 1s bit, bit 30 the 2s bit,
   bit 29 the 4s bit and bit 28 the 8s bit of rX; bt/bf on 28..31 then
   branch on those bits.

   Registers written: r0, r4-r9, CTR and CR fields 0, 1, 5, 6 and 7.  */
ENTRY (MEMSET, 5)
	/* Macro from sysdep.h (definition not visible in this file).  */
	CALL_MCOUNT 3

#define rTMP	r0
#define rRTN	r3	/* Initial value of 1st argument.  */
#define rMEMP0	r3	/* Original value of 1st arg.  */
#define rCHR	r4	/* Char to set in each byte.  */
#define rLEN	r5	/* Length of region to set.  */
#define rMEMP	r6	/* Address at which we are storing.  */
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning). */
#define rMEMP2	r8

/* r8 has three roles that are never live at the same time.  */
#define rNEG64	r8	/* Constant -64 for the dcbtst prefetch offset.  */
#define rCLS	r8	/* Cache line size obtained from static.  */
#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */
L(_memset):
/* Take care of case for size <= 8.  */
	cmpldi	cr1, rLEN, 8
	andi.	rALIGN, rMEMP0, 7	/* cr0.eq <=> already doubleword aligned.  */
	mr	rMEMP, rMEMP0
	ble-	cr1, L(small)

/* Align to doubleword boundary.  cr5 records whether the ORIGINAL length
   is <= 31; it is consumed at L(aligned).  */
	cmpldi	cr5, rLEN, 31
	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
	beq+	L(aligned2)
	mtcrf	0x01, rMEMP0		/* CR7 = low 4 bits of the address.  */
	subfic	rALIGN, rALIGN, 8	/* Bytes needed to reach the boundary.  */
	cror	28,30,31		/* Detect odd word aligned.  */
	add	rMEMP, rMEMP, rALIGN	/* rMEMP = first aligned address.  */
	sub	rLEN, rLEN, rALIGN
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
	bt	29, L(g4)		/* Address bit 2 set: start is in the odd word.  */
/* Process the even word of doubleword.  The stores below use negative
   offsets from the already advanced rMEMP.  */
	bf+	31, L(g2)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(g4x)
L(g2):
	sth	rCHR, -6(rMEMP)
L(g4x):
	stw	rCHR, -4(rMEMP)
	b	L(aligned)
/* Process the odd word of doubleword.  */
L(g4):
	bf	28, L(g4x) /* If false, word aligned on odd word.  */
	bf+	31, L(g0)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)
L(g0):
	sth	rCHR, -2(rMEMP)
	/* Falls through into L(aligned2); repeating the word replication
	   there is harmless because rCHR already holds the replicated word.  */

/* Handle the case of original size <= 31 (tested via cr5).  */
L(aligned2):
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
L(aligned):
	mtcrf	0x01, rLEN		/* CR7 = low 4 bits of remaining length.  */
	ble	cr5, L(medium)
/* Align to 32-byte boundary.  */
	andi.	rALIGN, rMEMP, 0x18	/* cr0.eq <=> already 32-byte aligned.  */
	subfic	rALIGN, rALIGN, 0x20	/* 8, 16 or 24 bytes to the boundary.  */
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
	beq	L(caligned)
	mtcrf	0x01, rALIGN		/* CR bit 28 = the 8s bit of rALIGN.  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP		/* Store backwards from the boundary.  */
	bf	28, L(a1)
	stdu	rCHR, -8(rMEMP2)
L(a1):	blt	cr1, L(a2)
	std	rCHR, -8(rMEMP2)
	stdu	rCHR, -16(rMEMP2)
L(a2):

/* Now aligned to a 32 byte boundary.  */
L(caligned):
	cmpldi	cr1, rCHR, 0
	clrrdi.	rALIGN, rLEN, 5		/* rALIGN = bytes in whole 32-byte blocks.  */
	mtcrf	0x01, rLEN		/* CR7 = low 4 bits of remaining length.  */
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
/* Store 32-byte blocks with ordinary doubleword stores.
   Preconditions on entry: rALIGN = rLEN rounded down to a multiple of 32,
   cr0 reflects rALIGN compared with zero, CR7 holds the low 4 bits of
   rLEN, rMEMP is the start of the remaining region.  */
L(nondcbz):
	srdi	rTMP, rALIGN, 5		/* Number of 32-byte blocks.  */
	mtctr	rTMP
	beq	L(medium)	/* We may not actually get to do a full line.  */
	clrldi.	rLEN, rLEN, 59		/* rLEN = tail bytes (0..31); sets cr0.  */
	add	rMEMP, rMEMP, rALIGN	/* Point past the last block.  */
	li	rNEG64, -0x40
	bdz	L(cloopdone)		/* Exactly one block: skip the loop.  */

/* Each iteration stores one block, walking backwards, after touching the
   address 64 bytes below rMEMP for store.  */
L(c3):	dcbtst	rNEG64, rMEMP
	std	rCHR, -8(rMEMP)
	std	rCHR, -16(rMEMP)
	std	rCHR, -24(rMEMP)
	stdu	rCHR, -32(rMEMP)
	bdnz	L(c3)
L(cloopdone):
	std	rCHR, -8(rMEMP)
	std	rCHR, -16(rMEMP)
	cmpldi	cr1, rLEN, 16		/* Set up cr1 for L(medium_tail2).  */
	std	rCHR, -24(rMEMP)
	stdu	rCHR, -32(rMEMP)
	beqlr				/* cr0 from clrldi.: no tail, done.  */
	add	rMEMP, rMEMP, rALIGN	/* Back to the end of the block area.  */
	b	L(medium_tail2)

	.align 5
/* Clear whole cache lines with dcbz, using the run-time cache line size.  */
L(zloopstart):
/* If the remaining length is less than 32 bytes, don't bother getting
	 the cache line size.  */
	beq	L(medium)
	/* Read the cache line size.  */
	__GLRO (rCLS, dl_cache_line_size,
		RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)

/* If the cache line size was not set just go to L(nondcbz) which is
	 safe for any cache line size.
	 NOTE(review): that branch relies on cr0 still holding the result of
	 the clrrdi. at L(caligned), i.e. on __GLRO not modifying cr0 --
	 confirm against the macro definition.  */
	cmpldi	cr1,rCLS,0
	beq		cr1,L(nondcbz)


/* Now we know the cache line size is set, but we may not yet be
	 aligned to the cache line. May have a partial line to fill, so
	 touch it 1st.  */
	dcbt	0,rMEMP
	addi	rCLM,rCLS,-1
/* Store 32-byte blocks forward until rMEMP is cache-line aligned or fewer
   than 32 bytes remain.  */
L(getCacheAligned):
	cmpldi	cr1,rLEN,32
	and.	rTMP,rCLM,rMEMP		/* cr0.eq <=> rMEMP is cache-line aligned.  */
	blt		cr1,L(handletail32)
	beq		L(cacheAligned)
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	std		rCHR,-32(rMEMP)
	std		rCHR,-24(rMEMP)
	std		rCHR,-16(rMEMP)
	std		rCHR,-8(rMEMP)
	b		L(getCacheAligned)

/* Now we are aligned to the cache line and can use dcbz.  Loop while at
   least one whole cache line remains.  */
L(cacheAligned):
	cmpld	cr1,rLEN,rCLS
	blt		cr1,L(handletail32)
	dcbz	0,rMEMP
	subf	rLEN,rCLS,rLEN
	add		rMEMP,rMEMP,rCLS
	b		L(cacheAligned)

/* We are here because the cache line size was set and the remainder
   (rLEN) is less than 32 bytes or less than the actual cache line size.
   So set up the preconditions for L(nondcbz) and go there.  CR7 is still
   the value loaded at L(caligned); this assumes only multiples of 32
   bytes were subtracted from rLEN since then (i.e. the cache line size is
   a multiple of 32) -- TODO confirm.
   The 64-bit clrrdi. matches the identical computation at L(caligned);
   rLEN is below the cache line size here, so the result is the same as
   with a 32-bit mask.  */
L(handletail32):
	clrrdi.	rALIGN, rLEN, 5
	b		L(nondcbz)

	.align 5
L(small):
/* Memset of 8 bytes or less.  rCHR has not been replicated on this path;
   stb stores its low byte.  */
	cmpldi	cr6, rLEN, 4
	cmpldi	cr5, rLEN, 1
	ble	cr6,L(le4)
	subi	rLEN, rLEN, 4
	stb	rCHR,0(rMEMP)
	stb	rCHR,1(rMEMP)
	stb	rCHR,2(rMEMP)
	stb	rCHR,3(rMEMP)
	addi	rMEMP,rMEMP, 4
	cmpldi	cr5, rLEN, 1		/* Redo the compare for the reduced length.  */
/* 0..4 bytes left: cr5 compares the length with 1, cr1 with 3.  */
L(le4):
	cmpldi	cr1, rLEN, 3
	bltlr	cr5			/* 0 bytes.  */
	stb	rCHR, 0(rMEMP)
	beqlr	cr5			/* 1 byte.  */
	stb	rCHR, 1(rMEMP)
	bltlr	cr1			/* 2 bytes.  */
	stb	rCHR, 2(rMEMP)
	beqlr	cr1			/* 3 bytes.  */
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.  Expects rMEMP doubleword aligned at the start of
   the remaining region and CR7 = low 4 bits of rLEN.  The region is
   filled backwards from its end: 1, 2 and 4 byte pieces first (CR bits
   31, 30, 29), then 16 bytes if rLEN >= 16 (cr1), then 8 bytes (bit 28).  */
	.align 5
L(medium):
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
	cmpldi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN	/* rMEMP = one past the last byte.  */
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt-	29, L(medium_29t)
L(medium_29f):
	bge-	cr1, L(medium_27t)
	bflr-	28
	std	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt-	cr1, L(medium_27f)
L(medium_27t):
	std	rCHR, -8(rMEMP)
	stdu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr-	28
L(medium_28t):
	std	rCHR, -8(rMEMP)
	blr
END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)
256