/* Optimized memset for Fujitsu A64FX processor.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sysdeps/aarch64/memset-reg.h>

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */

#define L1_SIZE		(64*1024)	// L1 64KB
#define L2_SIZE		(8*1024*1024)	// L2 8MB
#define CACHE_LINE_SIZE	256
#define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
#define vector_length	x9
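// Register aliases such as dstin, dst, dstend, count, valw, tmp1 and tmp2
// come from the included memset-reg.h; vector_length is defined here as a
// scratch register that holds the SVE vector length in bytes (64 on A64FX).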

#if HAVE_AARCH64_SVE_ASM
# if IS_IN (libc)
#  define MEMSET __memset_a64fx

	.arch armv8.2-a+sve

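	// st1b_unroll expands recursively into a run of st1b stores of z0
	// at offsets \first ... \last (in units of the vector length) from
	// dst, e.g. "st1b_unroll 0, 7" stores 8 consecutive vectors.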
	.macro st1b_unroll first=0, last=7
	st1b	z0.b, p0, [dst, \first, mul vl]
	.if \last-\first
	st1b_unroll "(\first+1)", \last
	.endif
	.endm


#undef BTI_C
#define BTI_C

ENTRY (MEMSET)
	PTR_ARG (0)
	SIZE_ARG (2)

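	// cntb reads the SVE vector length in bytes and dup broadcasts the
	// fill byte to every lane of z0.  whilelo sets p0 for the byte
	// indices [vector_length, 2 * vector_length) that are below count;
	// b.last is taken when the last lane is active, i.e. when
	// count >= vector_length * 2.  Otherwise two predicated stores
	// cover the whole [0, count) range.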
	cntb	vector_length
	dup	z0.b, valw
	whilelo	p0.b, vector_length, count
	b.last	1f
	whilelo	p1.b, xzr, count
	st1b	z0.b, p1, [dstin, 0, mul vl]
	st1b	z0.b, p0, [dstin, 1, mul vl]
	ret

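	// Sizes up to 8 * vector_length are handled by storing a few
	// vectors from dstin and a few from dstend; when count is not a
	// multiple of the vector length the two groups overlap in the
	// middle, which is harmless for memset.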
	// count >= vector_length * 2
1:	cmp	count, vector_length, lsl 2
	add	dstend, dstin, count
	b.hi	1f
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z0.b, p0, [dstin, 1, mul vl]
	st1b	z0.b, p0, [dstend, -2, mul vl]
	st1b	z0.b, p0, [dstend, -1, mul vl]
	ret

	// count > vector_length * 4
1:	lsl	tmp1, vector_length, 3
	cmp	count, tmp1
	b.hi	L(vl_agnostic)
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z0.b, p0, [dstin, 1, mul vl]
	st1b	z0.b, p0, [dstin, 2, mul vl]
	st1b	z0.b, p0, [dstin, 3, mul vl]
	st1b	z0.b, p0, [dstend, -4, mul vl]
	st1b	z0.b, p0, [dstend, -3, mul vl]
	st1b	z0.b, p0, [dstend, -2, mul vl]
	st1b	z0.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
L(vl_agnostic): // VL Agnostic
	mov	dst, dstin
	cmp	count, L1_SIZE
	b.hi	L(L1_prefetch)

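	// Main vector-length agnostic loop: tmp1 still holds
	// vector_length * 8 from above, so each iteration stores eight
	// vectors and advances dst by tmp1 bytes; the tail of at most
	// eight vectors is finished by L(last).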
	// count >= 8 * vector_length
L(unroll8):
	sub	count, count, tmp1
	.p2align 4
	// The cmp and branch at the top of the following loop are a
	// heuristically found workaround that avoids a performance drop
	// around the 16KB peak; the branch condition, b.ne, is chosen so
	// that the branch is never taken.
1:	cmp	xzr, xzr
	b.ne	1b
	st1b_unroll 0, 7
	add	dst, dst, tmp1
	subs	count, count, tmp1
	b.hi	1b
	add	count, count, tmp1

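	// Store the last 2, 5 or 8 vectors, ending exactly at dstend.
	// These stores may overlap bytes that were already written, which
	// is harmless for memset.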
L(last):
	cmp	count, vector_length, lsl 1
	b.ls	2f
	add	tmp2, vector_length, vector_length, lsl 2
	cmp	count, tmp2
	b.ls	5f
	st1b	z0.b, p0, [dstend, -8, mul vl]
	st1b	z0.b, p0, [dstend, -7, mul vl]
	st1b	z0.b, p0, [dstend, -6, mul vl]
5:	st1b	z0.b, p0, [dstend, -5, mul vl]
	st1b	z0.b, p0, [dstend, -4, mul vl]
	st1b	z0.b, p0, [dstend, -3, mul vl]
2:	st1b	z0.b, p0, [dstend, -2, mul vl]
	st1b	z0.b, p0, [dstend, -1, mul vl]
	ret

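	// Streaming loop with software prefetch for buffers larger than
	// the 64KB L1 cache.  It is only used when the vector length is
	// 64 bytes (as on A64FX), so four vectors equal one 256-byte cache
	// line; each iteration stores two cache lines and prefetches
	// PF_DIST_L1 bytes (16 lines) ahead.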
	// count >= L1_SIZE
	.p2align 3
L(L1_prefetch):
	cmp	count, L2_SIZE
	b.hs	L(L2)
	cmp	vector_length, 64
	b.ne	L(unroll8)
1:	st1b_unroll 0, 3
	prfm	pstl1keep, [dst, PF_DIST_L1]
	st1b_unroll 4, 7
	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
	add	dst, dst, CACHE_LINE_SIZE * 2
	sub	count, count, CACHE_LINE_SIZE * 2
	cmp	count, PF_DIST_L1
	b.hs	1b
	b	L(unroll8)

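	// For buffers of at least the 8MB L2 size, zero whole cache lines
	// with DC ZVA.  DC ZVA can only write zeroes, so the path is taken
	// only when the fill byte is zero, and the loop relies on the ZVA
	// block size matching the 256-byte cache line on A64FX.  The four
	// stores write one cache line's worth of data (4 x 64-byte
	// vectors) from dst, covering the unaligned head before dst is
	// rounded down to a cache-line boundary.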
	// count >= L2_SIZE
	.p2align 3
L(L2):
	tst	valw, 255
	b.ne	L(unroll8)
	// align dst to CACHE_LINE_SIZE byte boundary
	and	tmp2, dst, CACHE_LINE_SIZE - 1
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z0.b, p0, [dst, 1, mul vl]
	st1b	z0.b, p0, [dst, 2, mul vl]
	st1b	z0.b, p0, [dst, 3, mul vl]
	sub	dst, dst, tmp2
	add	count, count, tmp2

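	// Bias count down by two cache lines so that the DC ZVA loop below
	// always stops short of dstend; the remaining tail (at most one
	// cache line) is finished by the dstend-relative stores in L(last).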
	// clear cachelines using DC ZVA
	sub	count, count, CACHE_LINE_SIZE * 2
	.p2align 4
1:	add	dst, dst, CACHE_LINE_SIZE
	dc	zva, dst
	subs	count, count, CACHE_LINE_SIZE
	b.hi	1b
	add	count, count, CACHE_LINE_SIZE
	b	L(last)

END (MEMSET)
libc_hidden_builtin_def (MEMSET)

# endif /* IS_IN (libc) */
#endif /* HAVE_AARCH64_SVE_ASM */