/* memset optimized with AVX512 for KNL hardware.
   Copyright (C) 2015-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)
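/* Built only when the configured baseline allows AVX-512 (x86-64 ISA
   level 4).  */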


#include "asm-syntax.h"
#ifndef MEMSET
# define MEMSET __memset_avx512_no_vzeroupper
# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
#endif

	.section .text.avx512,"ax",@progbits
#if defined PIC
ENTRY (MEMSET_CHK)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMSET_CHK)
#endif

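/* memset is entered with dest in %rdi, the fill byte in %esi and the
   length n in %rdx.  A rough sketch of the size dispatch below
   (illustrative only; boundary cases follow the code, not this
   comment):

     n < 16:           scalar and vmovq stores built from %xmm0
     16 <= n <= 512:   two-ended unaligned xmm/ymm/zmm stores
     512 < n <= 1024:  sixteen unaligned zmm stores
     n <= cache/2:     64-byte aligned vmovaps loop
     n > cache/2:      128-byte aligned vmovntdq loop + sfence

   where cache/2 is __x86_shared_cache_size_half.  */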
ENTRY (MEMSET)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
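	/* Broadcast the fill byte: the zeroed %xmm0 serves as the
	   vpshufb index vector, so %xmm0 ends up holding 16 copies of
	   the low byte of %esi.  %rsi becomes one past the end of the
	   buffer and %rax the return value (dest).  */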
	vpxor	%xmm0, %xmm0, %xmm0
	vmovd	%esi, %xmm1
	lea	(%rdi, %rdx), %rsi
	mov	%rdi, %rax
	vpshufb	%xmm0, %xmm1, %xmm0
	cmp	$16, %rdx
	jb	L(less_16bytes)
	cmp	$512, %rdx
	vbroadcastss	%xmm0, %zmm2
	ja	L(512bytesormore)
	cmp	$256, %rdx
	jb	L(less_256bytes)
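	/* 256 <= n <= 512: four zmm stores from the front and four
	   ending exactly at the back; the two ranges may overlap,
	   which is harmless when storing a constant pattern.  */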
	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm2, 0xC0(%rdi)
	vmovups	%zmm2, -0x100(%rsi)
	vmovups	%zmm2, -0xC0(%rsi)
	vmovups	%zmm2, -0x80(%rsi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%rsi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	%ymm2, (%rdi)
	vmovdqu	%ymm2, -0x20(%rsi)
	ret

L(less_32bytes):
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm0, -0x10(%rsi)
	ret

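/* n < 16: reached before the zmm broadcast, so only %xmm0 holds the
   fill pattern here.  */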
L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	vmovq	%xmm0, (%rdi)
	vmovq	%xmm0, -0x08(%rsi)
	ret

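/* n < 8: move the 4-byte pattern into %ecx and finish with scalar
   dword/word/byte stores at both ends.  */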
L(less_8bytes):
	vmovd	%xmm0, %ecx
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	%ecx, (%rdi)
	mov	%ecx, -0x04(%rsi)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	%cx, (%rdi)
	mov	%cx, -0x02(%rsi)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

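/* n > 512: buffers larger than half of the shared cache take the
   non-temporal path; everything else uses regular stores.  */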
L(512bytesormore):
	mov	__x86_shared_cache_size_half(%rip), %rcx
	cmp	%rcx, %rdx
	ja	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)

	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm2, 0xC0(%rdi)
	vmovups	%zmm2, 0x100(%rdi)
	vmovups	%zmm2, 0x140(%rdi)
	vmovups	%zmm2, 0x180(%rdi)
	vmovups	%zmm2, 0x1C0(%rdi)
	vmovups	%zmm2, -0x200(%rsi)
	vmovups	%zmm2, -0x1C0(%rsi)
	vmovups	%zmm2, -0x180(%rsi)
	vmovups	%zmm2, -0x140(%rsi)
	vmovups	%zmm2, -0x100(%rsi)
	vmovups	%zmm2, -0xC0(%rsi)
	vmovups	%zmm2, -0x80(%rsi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

/* Align on 64 and loop with aligned stores.  */
L(1024bytesormore):
	sub	$0x100, %rsi
	vmovups	%zmm2, (%rax)
	and	$-0x40, %rdi
	add	$0x40, %rdi
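	/* %rdi is rounded up to the next 64-byte boundary (it always
	   advances, even if already aligned); the unaligned store to
	   (%rax) above has already covered the skipped bytes.  %rsi
	   was adjusted to end - 256 as the loop bound.  */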

L(gobble_256bytes_loop):
	vmovaps	%zmm2, (%rdi)
	vmovaps	%zmm2, 0x40(%rdi)
	vmovaps	%zmm2, 0x80(%rdi)
	vmovaps	%zmm2, 0xC0(%rdi)
	add	$0x100, %rdi
	cmp	%rsi, %rdi
	jb	L(gobble_256bytes_loop)
	vmovups	%zmm2, (%rsi)
	vmovups	%zmm2, 0x40(%rsi)
	vmovups	%zmm2, 0x80(%rsi)
	vmovups	%zmm2, 0xC0(%rsi)
	ret

/* Align on 128 and loop with non-temporal stores.  */
L(preloop_large):
	and	$-0x80, %rdi
	add	$0x80, %rdi
	vmovups	%zmm2, (%rax)
	vmovups	%zmm2, 0x40(%rax)
	sub	$0x200, %rsi
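	/* Same idea with 128-byte alignment: the two unaligned stores
	   at (%rax) cover the bytes skipped by rounding %rdi up, and
	   %rsi (end - 512) bounds the non-temporal loop.  */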

L(gobble_512bytes_nt_loop):
	vmovntdq %zmm2, (%rdi)
	vmovntdq %zmm2, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm2, 0xC0(%rdi)
	vmovntdq %zmm2, 0x100(%rdi)
	vmovntdq %zmm2, 0x140(%rdi)
	vmovntdq %zmm2, 0x180(%rdi)
	vmovntdq %zmm2, 0x1C0(%rdi)
	add	$0x200, %rdi
	cmp	%rsi, %rdi
	jb	L(gobble_512bytes_nt_loop)
	sfence
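	/* sfence makes the weakly-ordered non-temporal stores above
	   globally visible before the ordinary trailing stores, which
	   may overlap the last loop iteration.  */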
	vmovups	%zmm2, (%rsi)
	vmovups	%zmm2, 0x40(%rsi)
	vmovups	%zmm2, 0x80(%rsi)
	vmovups	%zmm2, 0xC0(%rsi)
	vmovups	%zmm2, 0x100(%rsi)
	vmovups	%zmm2, 0x140(%rsi)
	vmovups	%zmm2, 0x180(%rsi)
	vmovups	%zmm2, 0x1C0(%rsi)
	ret
END (MEMSET)
#endif