1/* Copyright (C) 2000-2022 Free Software Foundation, Inc.
2   This file is part of the GNU C Library.
3
4   The GNU C Library is free software; you can redistribute it and/or
5   modify it under the terms of the GNU Lesser General Public
6   License as published by the Free Software Foundation; either
7   version 2.1 of the License, or (at your option) any later version.
8
9   The GNU C Library is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Lesser General Public License for more details.
13
14   You should have received a copy of the GNU Lesser General Public
15   License along with the GNU C Library.  If not, see
16   <https://www.gnu.org/licenses/>.  */
17
18#include <sysdep.h>
19
20	.arch ev6		# assemble for the EV6 (21264) instruction set
21	.set noat		# the assembler must not use AT itself; PROF code uses it explicitly
22	.set noreorder		# keep the hand-made instruction order (GAS is documented to ignore this)
23
/* void *memset (void *s, int c, size_t n), scheduled for Alpha EV6.

   Stores the low byte of C into N bytes starting at S and returns S.

   Register use, as read from the code below:
     $16  in: destination; later the quad-aligned destination
     $17  in: fill value; rebuilt as that byte replicated in all 8 bytes
     $18  in: byte count; later the number of trailing bytes (0..7)
     $0   out: the original destination
     $6   end address (destination + count, one past the last byte)
     $5   store cursor
     $1-$4, $7  scratch
     $26  return address
   The stack is not used.

   Strategy: when the start and end addresses fall in the same aligned
   quadword the whole job is one read-modify-write ($within_quad).
   Otherwise a misaligned head is merged into its quadword, whole
   quadwords are stored (through a 64-byte unrolled loop with wh64
   hints when at least 16 quadwords remain), and a partial tail is
   merged into the last quadword ($no_quad).  Bytes outside the
   requested range are preserved by those read-modify-write sequences.

   The E / U / L tags in the trailing comments look like the 21264
   issue-slot classes (either cluster, upper subcluster, lower
   subcluster) -- TODO confirm against the 21264 Compiler Writer's
   Guide.  Instructions are laid out in groups of four, padded with
   nops; keep that grouping when editing.  */
24ENTRY(memset)
25#ifdef PROF
	/* Profiling build: load gp, then call _mcount through AT.  */
26	ldgp	gp, 0(pv)
27	lda	AT, _mcount
28	jsr	AT, (AT), _mcount
29	.prologue 1
30#else
31	.prologue 0
32#endif
33
34	/*
35	 * Serious stalling happens.  The only way to mitigate this is to
36	 * undertake a major re-write to interleave the constant materialization
37	 * with other parts of the fall-through code.  This is important, even
38	 * though it makes maintenance tougher.
39	 * Do this later.
	 *
	 * What the next four groups do: replicate the fill byte into every
	 * byte of $17, compute the end address in $6, and pick between the
	 * single-quadword path and the general path.  The length test uses
	 * ble, which is a signed comparison, so a count with the top bit
	 * set is treated like zero and nothing is written.
40	 */
41	and	$17, 255, $1	# E : 00000000000000ch
42	insbl	$17, 1, $2	# U : 000000000000ch00
43	mov	$16, $0		# E : return value
44	ble	$18, $end	# U : count <= 0 (signed)?  nothing to store
45
46	addq	$18, $16, $6	# E : end address (one past the last byte)
47	or	$1, $2, $17	# E : 000000000000chch
48	insbl	$1, 2, $3	# U : 0000000000ch0000
49	insbl	$1, 3, $4	# U : 00000000ch000000
50
51	or	$3, $4, $3	# E : 00000000chch0000
52	inswl	$17, 4, $5	# U : 0000chch00000000
53	xor	$16, $6, $1	# E : will complete write be within one quadword?
54	inswl	$17, 6, $2	# U : chch000000000000
55
56	or	$17, $3, $17	# E : 00000000chchchch
57	or	$2, $5, $2	# E : chchchch00000000
58	bic	$1, 7, $1	# E : drop the byte offset; zero if start and end share a quadword
59	and	$16, 7, $3	# E : Target addr misalignment
60
61	or	$17, $2, $17	# E : chchchchchchchch
62	beq	$1, $within_quad # U : start and end are in the same quadword
63	nop			# E :
64	beq	$3, $aligned	# U : target is 0mod8
65
66	/*
67	 * Target address is misaligned, and won't fit within a quadword.
	 *
	 * Merge the pattern into the first (partial) quadword, keeping the
	 * old bytes below the start offset, then move $16 up to the next
	 * quadword boundary and take the head bytes off the count in $18.
	 * The store goes through $5, the original unaligned address.
68	 */
69	ldq_u	$4, 0($16)	# L : Fetch first partial
70	mov	$16, $5		# E : Save the address
71	insql	$17, $16, $2	# U : Insert new bytes
72	subq	$3, 8, $3	# E : $3 = misalignment - 8 = -(number of head bytes)
73
74	addq	$18, $3, $18	# E : $18 is new count ($3 is negative)
75	mskql	$4, $16, $4	# U : clear relevant parts of the quad
76	subq	$16, $3, $16	# E : $16 is new aligned destination
77	or	$2, $4, $1	# E : Final bytes
78
79	nop
80	stq_u	$1,0($5)	# L : Store result
81	nop
82	nop
83
84	.align 4
85$aligned:
86	/*
87	 * We are now guaranteed to be quad aligned.  $18 bytes remain; that
88	 * can be zero if the head store ended exactly on a quadword boundary.
89	 */
90
91	sra	$18, 3, $3	# U : Number of remaining quads to write
92	and	$18, 7, $18	# E : Number of trailing bytes to write
93	mov	$16, $5		# E : Save dest address
94	beq	$3, $no_quad	# U : tail stuff only
95
96	/*
97	 * It's worth the effort to unroll this and use wh64 if possible.
98	 * At this point, entry values are:
99	 * $16	Current destination address
100	 * $5	A copy of $16
101	 * $6	End address (one past the last byte to write)
102	 * $18	Number trailer bytes
103	 * $3	Number quads to write
104	 */
105
106	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
107	subq	$3, 16, $4	# E : Only try to unroll if >= 16 quads (128 bytes)
108	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
109	blt	$4, $loop	# U : fewer than 16 quads: simple loop
110
111	/*
112	 * We know we've got at least 16 quads, minimum of one trip
113	 * through unrolled loop.  Do a quad at a time to get us 0mod64
114	 * aligned.
115	 */
116
	/*
	 * NOTE(review): $16 is quad aligned here, so $1 = ($16 & 0x3f) - 0x40
	 * is one of -64, -56, ..., -8 and is never zero.  The beq below is
	 * therefore never taken, and a destination that is already 0mod64
	 * still makes eight trips through $alignmod64.  That is also what
	 * keeps $4 usable: it only receives a wh64 address inside
	 * $alignmod64 (before that it holds $3 - 16).
	 */
117	nop			# E :
118	nop			# E :
119	nop			# E :
120	beq	$1, $bigalign	# U :
121
	/*
	 * One quad per trip until $5 is 0mod64; $1 counts up to zero.
	 * $4 and $5 are both advanced to the same address, so on exit
	 * $4 == $5 == start of the first 64-byte block.
	 */
122$alignmod64:
123	stq	$17, 0($5)	# L :
124	subq	$3, 1, $3	# E : For consistency later
125	addq	$1, 8, $1	# E : Increment towards zero for alignment
126	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
127
128	nop
129	nop
130	addq	$5, 8, $5	# E : Inc address
131	blt	$1, $alignmod64 # U :
132
133$bigalign:
134	/*
135	 * $3 - number quads left to go
136	 * $5 - target address (aligned 0mod64)
137	 * $17 - mask of stuff to store
138	 * Scratch registers available: $7, $2, $4, $1
139	 * We know that we'll be taking a minimum of one trip through.
140	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
141	 * Assumes the wh64 needs to be for 2 trips through the loop in the future.
142	 * The wh64 is issued for the starting destination address for trip +2
143	 * through the loop, and if there are fewer than two trips left, the target
144	 * address will be for the current trip.
	 *
	 * Each trip stores 8 quads (64 bytes).  $4 holds the address for the
	 * wh64 at the top of the trip; it is recomputed during the trip as
	 * $5 + 128 when at least 24 quads remain, else $5 + 64.
145	 */
146
147$do_wh64:
148	wh64	($4)		# L1 : memory subsystem write hint
149	subq	$3, 24, $2	# E : For determining future wh64 addresses
150	stq	$17, 0($5)	# L :
151	nop			# E :
152
153	addq	$5, 128, $4	# E : speculative target of next wh64
154	stq	$17, 8($5)	# L :
155	stq	$17, 16($5)	# L :
156	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
157
158	stq	$17, 24($5)	# L :
159	stq	$17, 32($5)	# L :
160	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
161	nop
162
163	stq	$17, 40($5)	# L :
164	stq	$17, 48($5)	# L :
165	subq	$3, 16, $2	# E : Repeat the loop at least once more?
166	nop
167
168	stq	$17, 56($5)	# L :
169	addq	$5, 64, $5	# E :
170	subq	$3, 8, $3	# E :
171	bge	$2, $do_wh64	# U :
172
173	nop
174	nop
175	nop
176	beq	$3, $no_quad	# U : Might have finished already
177
178	.align 4
179	/*
180	 * Simple loop for trailing quadwords, or for small amounts
181	 * of data (where we can't use an unrolled loop and wh64)
	 *
	 * The test is at the bottom, so $3 must be at least 1 on entry.
	 * Both ways in guarantee that: the early branch arrives with
	 * 1..15 quads, the fall-through from the unrolled loop with 1..7.
182	 */
183$loop:
184	stq	$17, 0($5)	# L :
185	subq	$3, 1, $3	# E : Decrement number quads left
186	addq	$5, 8, $5	# E : Inc address
187	bne	$3, $loop	# U : more?
188
189$no_quad:
190	/*
191	 * Write 0..7 trailing bytes.
	 *
	 * $5 is the quad-aligned address of the last, partial quadword, so
	 * the low three bits of the end address in $6 equal the trailing
	 * byte count.  The old quadword is read, its low bytes are replaced
	 * with the pattern, and the result is written back.  This path
	 * returns on its own instead of falling into $end.
192	 */
193	nop			# E :
194	beq	$18, $end	# U : All done?
195	ldq	$7, 0($5)	# L : Fetch the final quad
196	mskqh	$7, $6, $2	# U : Mask final quad
197
198	insqh	$17, $6, $4	# U : New bits
199	or	$2, $4, $1	# E : Put it all together
200	stq	$1, 0($5)	# L : And back to memory
201	ret	$31,($26),1	# L0 :
202
	/*
	 * The whole range lies inside one aligned quadword.  First build
	 * "old bytes below the start, pattern from the start upward", then
	 * cut that off at the end offset and take the old bytes from the
	 * end offset upward.  One load and one store in total.
	 */
203$within_quad:
204	ldq_u	$1, 0($16)	# L : Fetch the quadword holding the range
205	insql	$17, $16, $2	# U : New bits
206	mskql	$1, $16, $4	# U : Clear old
207	or	$2, $4, $2	# E : New result
208
209	mskql	$2, $6, $4	# U : Keep the result below the end offset
210	mskqh	$1, $6, $2	# U : Keep the old bytes from the end offset up
211	or	$2, $4, $1	# E : Merge
212	stq_u	$1, 0($16)	# L : And back to memory
213
	/* Common exit; $0 still holds the original destination.  */
214$end:
215	nop
216	nop
217	nop
218	ret $31,($26),1		# L0 :
219
220	END(memset)
221libc_hidden_builtin_def (memset)
222