1/* Optimized memset implementation for PowerPC64.
2   Copyright (C) 1997-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
22   Returns 's'.
23
24   The memset is done in three sizes: byte (8 bits), word (32 bits),
25   cache line (1024 bits). There is a special case for setting cache lines
26   to 0, to take advantage of the dcbz instruction.  */
27
28	.machine power4
29EALIGN (memset, 5, 0)
30	CALL_MCOUNT
31
/* Symbolic register names used below.

   r3 (rMEMP0) is only ever read in this routine, so it still holds 's'
   when the function returns.  All stores go through the working copy
   rMEMP (or rMEMP2).

   rMEMP2, rNEG64 and rCLS are three names for r8.  Each is live in a
   different phase: rMEMP2 while aligning to 32 bytes, rCLS in the dcbz
   path, rNEG64 in the 32-byte store loop (which is entered only after
   the dcbz path has finished with rCLS).

   rRTN and rCLM are defined but not referenced in the code below.  */
32#define rTMP	r0
33#define rRTN	r3	/* Initial value of 1st argument.  */
34#define rMEMP0	r3	/* Original value of 1st arg.  */
35#define rCHR	r4	/* Char to set in each byte.  */
36#define rLEN	r5	/* Length of region to set.  */
37#define rMEMP	r6	/* Address at which we are storing.  */
38#define rALIGN	r7	/* Number of bytes we are setting now (when aligning). */
39#define rMEMP2	r8	/* Backward-moving store pointer while aligning to 32 bytes.  */
40
41#define rNEG64	r8	/* Constant -64, used as the dcbtst offset in the store loop.  */
42#define rCLS	r8	/* Cache line size (known to be 128).  */
43#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */
44L(_memset):
45/* Take care of case for size <= 4.  */
46	cmplwi	cr1, rLEN, 4
47	andi.	rALIGN, rMEMP0, 3	/* cr0.eq <=> s is already word aligned.  */
48	mr	rMEMP, rMEMP0		/* Work on a copy; r3 stays intact.  */
49	ble-	cr1, L(small)
50
51/* Align to word boundary.  */
52	cmplwi	cr5, rLEN, 31		/* cr5 is consumed at L(aligned).  */
53	insrwi	rCHR, rCHR, 8, 16     /* Replicate byte to halfword.  */
54	beq+	L(aligned)
55	mtcrf	0x01, rMEMP0		/* CR bits 28-31 = low 4 address bits.  */
56	subfic	rALIGN, rALIGN, 4	/* Bytes needed to reach a word boundary (1-3).  */
57	add	rMEMP, rMEMP, rALIGN
58	sub	rLEN, rLEN, rALIGN
59	bf+	31, L(g0)		/* Even address: only a halfword is needed.  */
60	stb	rCHR, 0(rMEMP0)
61	bt	30, L(aligned)		/* (s & 3) == 3: the single byte was enough.  */
62L(g0):
63	sth	rCHR, -2(rMEMP)		/* Last two bytes before the word boundary.  */
64
65/* Handle the case of size <= 31 (cr5 was set from the original length,
   before the alignment bytes were subtracted).  */
66L(aligned):
67	mtcrf	0x01, rLEN		/* CR bits 28-31 = low 4 bits of rLEN, for L(medium).  */
68	insrwi	rCHR, rCHR, 16, 0    /* Replicate halfword to word.  */
69	ble	cr5, L(medium)
70/* Align to 32-byte boundary.  */
71	andi.	rALIGN, rMEMP, 0x1C	/* cr0.eq <=> already 32-byte aligned.  */
72	subfic	rALIGN, rALIGN, 0x20	/* Bytes up to the next 32-byte boundary.  */
73	beq	L(caligned)		/* Still tests cr0 from the andi. above.  */
74	mtcrf	0x01, rALIGN		/* CR bit 28 = 8s bit, bit 29 = 4s bit of rALIGN.  */
75	add	rMEMP, rMEMP, rALIGN
76	sub	rLEN, rLEN, rALIGN
77	cmplwi	cr1, rALIGN, 0x10
78	mr	rMEMP2, rMEMP		/* Fill backwards from the boundary.  */
79	bf	28, L(a1)
80        stw     rCHR, -4(rMEMP2)
81	stwu	rCHR, -8(rMEMP2)	/* 8 bytes stored; rMEMP2 -= 8.  */
82L(a1):	blt	cr1, L(a2)
83        stw     rCHR, -4(rMEMP2)
84	stw	rCHR, -8(rMEMP2)
85	stw	rCHR, -12(rMEMP2)
86	stwu	rCHR, -16(rMEMP2)	/* 16 bytes stored; rMEMP2 -= 16.  */
87L(a2):  bf      29, L(caligned)
88        stw     rCHR, -4(rMEMP2)	/* Remaining single word.  */
89
90/* Now aligned to a 32 byte boundary.  */
91L(caligned):
92	cmplwi	cr1, rCHR, 0		/* Is the fill word zero?  */
93	clrrwi.	rALIGN, rLEN, 5		/* rALIGN = rLEN rounded down to 32; cr0.eq if none.  */
94	mtcrf	0x01, rLEN		/* Refresh CR bits 28-31 for the tail code.  */
95	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
/* Store rALIGN bytes in 32-byte chunks, then the tail.  Expects rALIGN
   and cr0 as set by a preceding 'clrrwi. rALIGN, rLEN, 5'.  */
96L(nondcbz):
97	srwi	rTMP, rALIGN, 5		/* Number of 32-byte chunks.  */
98	mtctr	rTMP
99	beq	L(medium)	/* We may not actually get to do a full line.  */
100	clrlwi.	rLEN, rLEN, 27		/* rLEN = tail length (0-31); cr0.eq if no tail.  */
101	add	rMEMP, rMEMP, rALIGN	/* Point past the chunks; they are filled backwards.  */
102	li	rNEG64, -0x40
103	bdz	L(cloopdone)		/* Only one chunk: skip the loop.  */
104
105        .align 4
/* Each iteration stores one 32-byte chunk and touches the block 64
   bytes below rMEMP for store.  The last chunk is done at L(cloopdone).  */
106L(c3): 	dcbtst	rNEG64, rMEMP
107        stw     rCHR, -4(rMEMP)
108	stw	rCHR, -8(rMEMP)
109        stw     rCHR, -12(rMEMP)
110	stw	rCHR, -16(rMEMP)
111        stw     rCHR, -20(rMEMP)
112	stw	rCHR, -24(rMEMP)
113        stw     rCHR, -28(rMEMP)
114	stwu	rCHR, -32(rMEMP)
115	bdnz	L(c3)
116L(cloopdone):
117        stw     rCHR, -4(rMEMP)
118	stw	rCHR, -8(rMEMP)
119        stw     rCHR, -12(rMEMP)
120	stw	rCHR, -16(rMEMP)
121	cmplwi	cr1, rLEN, 16		/* Set up cr1 for L(medium_tail2).  */
122        stw     rCHR, -20(rMEMP)
123	stw	rCHR, -24(rMEMP)
124        stw     rCHR, -28(rMEMP)
125	stwu	rCHR, -32(rMEMP)
126	beqlr				/* cr0.eq from the clrlwi. above: no tail, done.  */
127	add	rMEMP, rMEMP, rALIGN	/* The stwu steps walked rMEMP back by rALIGN; go to the end of the chunks.  */
128	b	L(medium_tail2)
129
130	.align 5
131/* Clear lines of memory in 128-byte chunks.  */
/* Reached only when the replicated fill word is zero, so the plain
   stores below write zeros as well.  */
132L(zloopstart):
133/* If the remaining length is less than 32 bytes, don't bother getting
134	 the cache line size.  */
135	beq	L(medium)		/* cr0 from the clrrwi. at L(caligned).  */
136	li      rCLS,128  /* cache line size is 128 */
137	dcbt	0,rMEMP
/* Store 32-byte chunks until rMEMP is 128-byte aligned or fewer than
   32 bytes remain.  */
138L(getCacheAligned):
139	cmplwi	cr1,rLEN,32
140	andi.	rTMP,rMEMP,127		/* cr0.eq <=> rMEMP is 128-byte aligned.  */
141	blt	cr1,L(handletail32)
142	beq	L(cacheAligned)
143	addi	rMEMP,rMEMP,32
144	addi	rLEN,rLEN,-32
145	stw	rCHR,-32(rMEMP)
146        stw     rCHR,-28(rMEMP)
147	stw	rCHR,-24(rMEMP)
148	stw     rCHR,-20(rMEMP)
149	stw	rCHR,-16(rMEMP)
150        stw     rCHR,-12(rMEMP)
151	stw	rCHR,-8(rMEMP)
152        stw     rCHR,-4(rMEMP)
153	b	L(getCacheAligned)
154
155/* Now we are aligned to the cache line and can use dcbz.  */
156        .align 4
157L(cacheAligned):
158	cmplw	cr1,rLEN,rCLS
159	blt	cr1,L(handletail32)	/* Less than one full line left.  */
160	dcbz	0,rMEMP			/* Zero the cache block at rMEMP (treated as rCLS bytes).  */
161	subf	rLEN,rCLS,rLEN
162	add	rMEMP,rMEMP,rCLS
163	b	L(cacheAligned)
164
165/* We are here because the cache line size was set and the remainder
166  (rLEN) is less than the actual cache line size.
167   So set up the preconditions for L(nondcbz) and go there.  */
168L(handletail32):
169	clrrwi.	rALIGN, rLEN, 5		/* Same setup as at L(caligned).  */
170	b		L(nondcbz)
171
172	.align 5
173L(small):
174/* Memset of 4 bytes or less.  */
175	cmplwi	cr5, rLEN, 1
176	cmplwi	cr1, rLEN, 3
177	bltlr	cr5			/* n == 0.  */
178	stb	rCHR, 0(rMEMP)
179	beqlr	cr5			/* n == 1.  */
180	stb	rCHR, 1(rMEMP)
181	bltlr	cr1			/* n == 2.  */
182	stb	rCHR, 2(rMEMP)
183	beqlr	cr1			/* n == 3.  */
184	stb	rCHR, 3(rMEMP)
185	blr
186
187/* Memset of 0-31 bytes.  */
/* On entry rCHR holds the fill byte replicated across a word and CR
   bits 28-31 hold the low four bits of rLEN (from an earlier mtcrf).
   The region is filled backwards from its end: a byte if bit 31 is set,
   a halfword if bit 30, a word if bit 29, 16 bytes if rLEN >= 16 (cr1),
   and 8 bytes if bit 28.  */
188	.align 5
189L(medium):
190	cmplwi	cr1, rLEN, 16
191L(medium_tail2):			/* Entry for callers that have already set cr1.  */
192	add	rMEMP, rMEMP, rLEN	/* Point one past the last byte to set.  */
193L(medium_tail):
194	bt-	31, L(medium_31t)
195	bt-	30, L(medium_30t)
196L(medium_30f):
197	bt-	29, L(medium_29t)
198L(medium_29f):
199	bge-	cr1, L(medium_27t)
200	bflr-	28			/* No 8-byte piece: done.  */
201        stw     rCHR, -4(rMEMP)
202	stw	rCHR, -8(rMEMP)
203	blr
204
205L(medium_31t):
206	stbu	rCHR, -1(rMEMP)
207	bf-	30, L(medium_30f)
208L(medium_30t):
209	sthu	rCHR, -2(rMEMP)
210	bf-	29, L(medium_29f)
211L(medium_29t):
212	stwu	rCHR, -4(rMEMP)
213	blt-	cr1, L(medium_27f)
214L(medium_27t):
215        stw     rCHR, -4(rMEMP)
216	stw	rCHR, -8(rMEMP)
217        stw     rCHR, -12(rMEMP)
218	stwu	rCHR, -16(rMEMP)	/* 16 bytes stored; rMEMP -= 16.  */
219L(medium_27f):
220	bflr-	28			/* No 8-byte piece: done.  */
221L(medium_28t):
222        stw     rCHR, -4(rMEMP)
223	stw	rCHR, -8(rMEMP)
224	blr
225END (memset)
226libc_hidden_builtin_def (memset)
227