/* Set a block of memory to some byte value.  For SUN4V M7.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Condition-code set named by the %XCC branches in this file.  It may be
   predefined by whoever includes this file; otherwise it defaults to xcc.  */
#ifndef XCC
# define XCC    xcc
#endif
	/* Tell the assembler that %g2 and %g3 are used as scratch registers.  */
	.register	%g2, #scratch
	.register	%g3, #scratch

/* The algorithm is as follows :
 *
 *	For stores of 7 or fewer bytes, individual bytes are stored.
 *
 *	For stores of fewer than 32 bytes, align the address on a 4-byte
 *	boundary.  Then store as many 4-byte chunks as possible, followed
 *	by the trailing bytes.
 *
 *	For sizes of 32 bytes or more, align the address on an 8-byte boundary.
 *	if (count >= 64) {
 *		store 8-byte chunks to align the address on a 64-byte boundary
 *		if (value to be set is zero && count >= MIN_ZERO) {
 *			Using BIS stores, set the first long word of each
 *			64-byte cache line to zero which will also clear the
 *			other seven long words of the cache line.
 *		}
 *		else if (count >= MIN_LOOP) {
 *			Using BIS stores, set the first long word of each of
 *			ST_CHUNK cache lines (64 bytes each) before the main
 *			loop is entered.
 *			In the main loop, continue pre-setting the first long
 *			word of each cache line ST_CHUNK lines in advance while
 *			setting the other seven long words (56 bytes) of each
 *			cache line until fewer than ST_CHUNK*64 bytes remain.
 *			Then set the remaining seven long words of each cache
 *			line that has already had its first long word set.
 *		}
 *		store remaining data in 64-byte chunks until fewer than
 *		64 bytes remain.
 *	}
 *	Store as many 8-byte chunks as possible, followed by trailing bytes.
 *
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline. That avoids various pipeline delays,
 *   such as filling the miss buffer. The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS for memsets of at least MIN_LOOP bytes (MIN_ZERO bytes
 * for zero fills) because a sequence of BIS stores must be followed by a
 * membar #StoreStore. The benefit of the BIS store must be balanced
 * against the cost of the membar operation.
 */

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */

#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define ASI_ST_BLK_INIT_MRU_P 0xf2

#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

/* Tuning thresholds.  MIN_LOOP and MIN_ZERO are compared against the number
   of bytes that will be written as whole 64-byte cache lines.  */
#define ST_CHUNK	24   /* multiple of 4 due to loop unrolling */
#define MIN_LOOP	((ST_CHUNK) * 64)   /* parenthesized so it expands safely */
#define MIN_ZERO	256

/* EX_ST and EX_RETVAL expand to their argument unchanged.  */
#define EX_ST(x)	x
#define EX_RETVAL(x)	x
/* 8-byte store of src to [addr] through the block-init ASIs above:
   STORE_ASI uses the "most recently used" variant, STORE_INIT the
   "least recently used" one.  */
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text
	.align		32

/* void *memset (void *dst, int c, size_t n)

   In:	%o0 = dst, %o1 = c, %o2 = n
   Out:	%o0 = dst (never written by this routine)

   Register roles inside the routine:
	%o1	fill value; the low byte of c replicated to 2, 4 and then
		8 bytes as wider stores become possible
	%o2	bytes still to be set
	%o3	scratch: negative "bytes until aligned" counters, then the
		number of bytes left over after the 64-byte block stores
	%o4	bytes to be written as whole 64-byte cache lines
	%o5	write cursor (starts as a copy of dst)
	%g1	cache-line counter in the large-fill loop
	%g3	offset register in the zero-fill loop

   The routine returns with retl and uses no stack.
   An instruction indented by one extra space sits in the delay slot of
   the branch above it.  */
ENTRY(__memset_niagara7)
	/* memset (dst, c, size)  */
	mov	%o0, %o5		/* work on a copy; %o0 is the return value  */
	cmp	%o2, 7			/* if small counts, just write bytes  */
	bleu,pn %XCC, .Lwrchar
	 and	%o1, 0xff, %o1		/* o1 is (char)c  */

	sll	%o1, 8, %o3
	or	%o1, %o3, %o1		/* now o1 has 2 bytes of c  */
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%XCC, .Lwdalign		/* 8..31 bytes: use 4-byte stores  */
	 or	%o1, %o3, %o1		/* now o1 has 4 bytes of c  */

	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1		/* now o1 has 8 bytes of c  */

.Ldbalign:
	andcc	%o5, 7, %o3		/* is dst aligned on an 8 byte bound?  */
	bz,pt	%XCC, .Lblkalign	/* already long word aligned  */
	 sub	%o3, 8, %o3		/* -(bytes till long word aligned)  */

	add	%o2, %o3, %o2		/* update o2 with new count  */
	/* Set -(%o3) bytes till dst is long word aligned  */
1:	stb	%o1, [%o5]		/* there is at least 1 byte to set  */
	inccc	%o3			/* count up towards zero  */
	bl,pt	%XCC, 1b
	 inc	%o5

	/* Now dst is long word aligned (the cursor is in %o5) */
.Lblkalign:
	cmp	%o2, 64		/* check if there are 64 bytes to set  */
	blu,pn	%XCC, .Lwrshort
	 mov	%o2, %o3		/* .Lwrshort expects its count in %o3  */

	andcc	%o5, 63, %o3		/* is dst block aligned?  */
	bz,pt	%XCC, .Lblkwr		/* now block aligned  */
	 sub	%o3, 64, %o3		/* o3 is -(bytes till block aligned)  */
	add	%o2, %o3, %o2		/* o2 is the remainder  */

	/* Store -(%o3) bytes till dst is block (64 byte) aligned.  */
	/* Use long word stores.  */
	/* Recall that dst is already long word aligned  */
1:
	addcc	%o3, 8, %o3
	stx	%o1, [%o5]
	bl,pt	%XCC, 1b
	 add	%o5, 8, %o5

	/* Now dst is block aligned  */
.Lblkwr:
	andn	%o2, 63, %o4		/* calculate size of blocks in bytes  */
	brz,pn	%o1, .Lwrzero		/* special case if c == 0  */
	 and	%o2, 63, %o3		/* %o3 = bytes left after blk stores  */

	cmp	%o4, MIN_LOOP		/* check for enough bytes to set  */
	blu,pn	%XCC, .Lshort_set	/* to justify cost of membar   */
	 nop				/* must be > pre-cleared lines  */

	/* initial cache-clearing stores  */
	/* get store pipeline moving  */

/*	Primary memset loop for large memsets  */
.Lwr_loop:
	mov	ST_CHUNK, %g1
	/* Write the first long word of each of the next ST_CHUNK cache
	   lines, four lines per iteration.  */
.Lwr_loop_start:
	subcc	%g1, 4, %g1
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	bgu	%XCC, .Lwr_loop_start
	 add	%o5, 64, %o5

	sub	%o5, ST_CHUNK*64, %o5	/* back to the first line of the chunk  */
	mov	ST_CHUNK, %g1
	sub	%o5, 8, %o5		/* bias by -8 so the last long word of a
					   line ends up at [%o5] below  */

	/* Fill long words 1..7 of each line whose first long word was
	   written above.  On entry %o5 = line start - 8.  */
.Lwr_loop_rest:
	stx	%o1,[%o5+8+8]
	sub	%o4, 64, %o4		/* one more line accounted for  */
	stx	%o1,[%o5+16+8]
	subcc	%g1, 1, %g1
	stx	%o1,[%o5+24+8]
	stx	%o1,[%o5+32+8]
	stx	%o1,[%o5+40+8]
	add	%o5, 64, %o5		/* %o5 = last long word of this line  */
	stx	%o1,[%o5-8]
	bgu	%XCC, .Lwr_loop_rest
	 EX_ST(STORE_INIT(%o1,%o5))	/* final store to the line  */

	add	%o5, 8, %o5		/* remove the -8 bias: next line start  */

	/* If at least ST_CHUNK*64 bytes remain to set, continue  */
	/* setting the first long word of each cache line in advance  */
	/* to keep the store pipeline moving.  */

	cmp	%o4, ST_CHUNK*64
	bge,pt	%XCC, .Lwr_loop_start
	 mov	ST_CHUNK, %g1

	brz,a,pn %o4, .Lasi_done	/* no whole lines left  */
	 nop

	sub	%o5, 8, %o5		/* cancels the first add in the loop  */
	/* Fewer than ST_CHUNK lines left: write them one full line at a
	   time.  */
.Lwr_loop_small:
	add	%o5, 8, %o5		/* %o5 = start of the line to write  */
	EX_ST(STORE_ASI(%o1,%o5))
	stx	%o1,[%o5+8]
	stx	%o1,[%o5+16]
	stx	%o1,[%o5+24]
	stx	%o1,[%o5+32]
	subcc	%o4, 64, %o4
	stx	%o1,[%o5+40]
	add	%o5, 56, %o5		/* %o5 = last long word of this line  */
	stx	%o1,[%o5-8]
	bgu,pt	%XCC, .Lwr_loop_small
	 EX_ST(STORE_INIT(%o1,%o5))	/* final store to the line  */

	ba	.Lasi_done
	 add	%o5, 8, %o5		/* %o5 = first byte after the last line  */

/*	Special case loop for zero fill memsets  */
/*	For each 64 byte cache line, single STBI to first element  */
/*	clears line  */
.Lwrzero:
	cmp	%o4, MIN_ZERO		/* check if enough bytes to set  */
					/* to justify the membar cost  */
	blu	%XCC, .Lshort_set
	 nop
	sub	%o4, 256, %o4		/* bias so the loop can test with bge  */

	/* Four cache lines (256 bytes) per iteration.  */
.Lwrzero_loop:
	mov	64, %g3
	EX_ST(STORE_INIT(%o1,%o5))		/* line 0  */
	subcc	%o4, 256, %o4
	EX_ST(STORE_INIT(%o1,%o5+%g3))		/* line 1  */
	add	%o5, 256, %o5
	sub	%g3, 192, %g3			/* %g3 = -128  */
	EX_ST(STORE_INIT(%o1,%o5+%g3))		/* line 2  */
	add %g3, 64, %g3			/* %g3 = -64  */
	bge,pt	%XCC, .Lwrzero_loop
	 EX_ST(STORE_INIT(%o1,%o5+%g3))		/* line 3  */
	add	%o4, 256, %o4		/* undo the bias  */

	brz,pn	%o4, .Lbsi_done
	 nop
	/* Up to three remaining cache lines, one store each.  */
.Lwrzero_small:
	EX_ST(STORE_INIT(%o1,%o5))
	subcc	%o4, 64, %o4
	bgu,pt	%XCC, .Lwrzero_small
	 add	%o5, 64, %o5

.Lasi_done:
.Lbsi_done:
	membar	#StoreStore		/* required by use of BIS  */

.Lshort_set:
	cmp	%o4, 64			/* check if 64 bytes to set  */
	blu	%XCC, 5f
	 nop
4:					/* set final blocks of 64 bytes  */
	stx	%o1, [%o5]
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+32]
	stx	%o1, [%o5+40]
	add	%o5, 64, %o5
	stx	%o1, [%o5-16]
	bgu,pt	%XCC, 4b
	 stx	%o1, [%o5-8]

5:
	/* Set the remaining long words; %o3 holds the bytes left (< 64)  */
.Lwrshort:
	subcc	%o3, 8, %o3		/* Can we store any long words?  */
	blu,pn	%XCC, .Lwrchars
	 and	%o2, 7, %o2		/* calc bytes left after long words  */
6:
	subcc	%o3, 8, %o3
	stx	%o1, [%o5]		/* store the long words  */
	bgeu,pt %XCC, 6b
	 add	%o5, 8, %o5

.Lwrchars:				/* check for extra chars  */
	brnz	%o2, .Lwrfin
	 nop
	retl
	 nop

	/* 8..31 bytes: byte stores up to a 4-byte boundary, then words.  */
.Lwdalign:
	andcc	%o5, 3, %o3		/* is dst aligned on a word boundary  */
	bz,pn	%XCC, .Lwrword
	 andn	%o2, 3, %o3		/* create word sized count in %o3  */

	dec	%o2			/* decrement count  */
	stb	%o1, [%o5]		/* set a byte  */
	b	.Lwdalign
	 inc	%o5			/* next byte  */

.Lwrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		/* 4-byte writing loop  */
	bnz,pt	%XCC, .Lwrword
	 add	%o5, 4, %o5
	and	%o2, 3, %o2		/* leftover count, if any  */

.Lwrchar:
	/* Set the remaining bytes, if any  */
	brz	%o2, .Lexit
	 nop
.Lwrfin:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%XCC, .Lwrfin
	 inc	%o5
.Lexit:
	retl				/* %o0 was preserved  */
	 nop
END(__memset_niagara7)
#endif
