1/* memset with unaligned store and rep stosb
2   Copyright (C) 2016-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19/* memset is implemented as:
20   1. Use overlapping store to avoid branch.
21   2. If size is less than VEC, use integer register stores.
22   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
23   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
24   5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
25      4 VEC stores and store 4 * VEC at a time until done.  */
26
27#include <sysdep.h>
28
29#ifndef MEMSET_CHK_SYMBOL
30# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
31#endif
32
33#ifndef WMEMSET_CHK_SYMBOL
34# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
35#endif
36
37#ifndef XMM0
38# define XMM0				xmm0
39#endif
40
41#ifndef YMM0
42# define YMM0				ymm0
43#endif
44
45#ifndef VZEROUPPER
46# if VEC_SIZE > 16
47#  define VZEROUPPER			vzeroupper
48#  define VZEROUPPER_SHORT_RETURN	vzeroupper; ret
49# else
50#  define VZEROUPPER
51# endif
52#endif
53
54#ifndef VZEROUPPER_SHORT_RETURN
55# define VZEROUPPER_SHORT_RETURN	rep; ret
56#endif
57
58#ifndef MOVQ
59# if VEC_SIZE > 16
60#  define MOVQ				vmovq
61#  define MOVD				vmovd
62# else
63#  define MOVQ				movq
64#  define MOVD				movd
65# endif
66#endif
67
68#if VEC_SIZE == 64
69# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
70#else
71# define LOOP_4X_OFFSET	(0)
72#endif
73
74#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
75# define END_REG	rcx
76# define LOOP_REG	rdi
77# define LESS_VEC_REG	rax
78#else
79# define END_REG	rdi
80# define LOOP_REG	rdx
81# define LESS_VEC_REG	rdi
82#endif
83
84#ifdef USE_XMM_LESS_VEC
85# define XMM_SMALL	1
86#else
87# define XMM_SMALL	0
88#endif
89
90#ifdef USE_LESS_VEC_MASK_STORE
91# define SET_REG64	rcx
92# define SET_REG32	ecx
93# define SET_REG16	cx
94# define SET_REG8	cl
95#else
96# define SET_REG64	rsi
97# define SET_REG32	esi
98# define SET_REG16	si
99# define SET_REG8	sil
100#endif
101
102#define PAGE_SIZE 4096
103
104/* Macro to calculate size of small memset block for aligning
105   purposes.  */
106#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
107
108
109#ifndef SECTION
110# error SECTION is not defined!
111#endif
112
113	.section SECTION(.text), "ax", @progbits
114#if IS_IN (libc)
115# if defined SHARED
116ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
117	cmp	%RDX_LP, %RCX_LP
118	jb	HIDDEN_JUMPTARGET (__chk_fail)
119END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
120# endif
121
122ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
123	shl	$2, %RDX_LP
124	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
125	WMEMSET_VDUP_TO_VEC0_LOW()
126	cmpq	$VEC_SIZE, %rdx
127	jb	L(less_vec_from_wmemset)
128	WMEMSET_VDUP_TO_VEC0_HIGH()
129	jmp	L(entry_from_wmemset)
130END (WMEMSET_SYMBOL (__wmemset, unaligned))
131#endif
132
133#if defined SHARED && IS_IN (libc)
134ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
135	cmp	%RDX_LP, %RCX_LP
136	jb	HIDDEN_JUMPTARGET (__chk_fail)
137END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
138#endif
139
140ENTRY (MEMSET_SYMBOL (__memset, unaligned))
141	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
142# ifdef __ILP32__
143	/* Clear the upper 32 bits.  */
144	mov	%edx, %edx
145# endif
146	cmpq	$VEC_SIZE, %rdx
147	jb	L(less_vec)
148	MEMSET_VDUP_TO_VEC0_HIGH()
149L(entry_from_wmemset):
150	cmpq	$(VEC_SIZE * 2), %rdx
151	ja	L(more_2x_vec)
152	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
153	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
154	VMOVU	%VEC(0), (%rdi)
155	VZEROUPPER_RETURN
156#if defined USE_MULTIARCH && IS_IN (libc)
157END (MEMSET_SYMBOL (__memset, unaligned))
158
159# if defined SHARED && IS_IN (libc)
160ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
161	cmp	%RDX_LP, %RCX_LP
162	jb	HIDDEN_JUMPTARGET (__chk_fail)
163END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
164# endif
165
166ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
167	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
168# ifdef __ILP32__
169	/* Clear the upper 32 bits.  */
170	mov	%edx, %edx
171# endif
172	cmp	$VEC_SIZE, %RDX_LP
173	jb	L(less_vec)
174	MEMSET_VDUP_TO_VEC0_HIGH ()
175	cmp	$(VEC_SIZE * 2), %RDX_LP
176	ja	L(stosb_more_2x_vec)
177	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
178	VMOVU	%VEC(0), (%rdi)
179	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
180	VZEROUPPER_RETURN
181#endif
182
183	.p2align 4,, 4
184L(last_2x_vec):
185#ifdef USE_LESS_VEC_MASK_STORE
186	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
187	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
188#else
189	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
190	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
191#endif
192	VZEROUPPER_RETURN
193
194	/* If have AVX512 mask instructions put L(less_vec) close to
195	   entry as it doesn't take much space and is likely a hot target.
196	 */
197#ifdef USE_LESS_VEC_MASK_STORE
198	.p2align 4,, 10
199L(less_vec):
200L(less_vec_from_wmemset):
201	/* Less than 1 VEC.  */
202# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
203#  error Unsupported VEC_SIZE!
204# endif
205	/* Clear high bits from edi. Only keeping bits relevant to page
206	   cross check. Note that we are using rax which is set in
207	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
208	andl	$(PAGE_SIZE - 1), %edi
209	/* Check if VEC_SIZE store cross page. Mask stores suffer
210	   serious performance degradation when it has to fault supress.
211	 */
212	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
213	/* This is generally considered a cold target.  */
214	ja	L(cross_page)
215# if VEC_SIZE > 32
216	movq	$-1, %rcx
217	bzhiq	%rdx, %rcx, %rcx
218	kmovq	%rcx, %k1
219# else
220	movl	$-1, %ecx
221	bzhil	%edx, %ecx, %ecx
222	kmovd	%ecx, %k1
223# endif
224	vmovdqu8 %VEC(0), (%rax){%k1}
225	VZEROUPPER_RETURN
226
227# if defined USE_MULTIARCH && IS_IN (libc)
228	/* Include L(stosb_local) here if including L(less_vec) between
229	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
230	   L(stosb_more_2x_vec) target.  */
231	.p2align 4,, 10
232L(stosb_local):
233	movzbl	%sil, %eax
234	mov	%RDX_LP, %RCX_LP
235	mov	%RDI_LP, %RDX_LP
236	rep	stosb
237	mov	%RDX_LP, %RAX_LP
238	VZEROUPPER_RETURN
239# endif
240#endif
241
242#if defined USE_MULTIARCH && IS_IN (libc)
243	.p2align 4
244L(stosb_more_2x_vec):
245	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
246	ja	L(stosb_local)
247#endif
248	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
249	   and (4x, 8x] jump to target.  */
250L(more_2x_vec):
251	/* Store next 2x vec regardless.  */
252	VMOVU	%VEC(0), (%rdi)
253	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
254
255
256	/* Two different methods of setting up pointers / compare. The two
257	   methods are based on the fact that EVEX/AVX512 mov instructions take
258	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
259	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
260	   address mode. For EVEX/AVX512 this saves code size and keeps a few
261	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
262	   bottlenecks.  */
263#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
264	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
265	addq	%rdx, %END_REG
266#endif
267
268	cmpq	$(VEC_SIZE * 4), %rdx
269	jbe	L(last_2x_vec)
270
271
272#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
273	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
274	   LEA_BID.  */
275
276	/* END_REG is rcx for EVEX/AVX512.  */
277	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
278#endif
279
280	/* Store next 2x vec regardless.  */
281	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
282	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
283
284
285#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
286	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
287	   extra offset to addresses in loop. Used for AVX512 to save space
288	   as no way to get (VEC_SIZE * 4) in imm8.  */
289# if LOOP_4X_OFFSET == 0
290	subq	$-(VEC_SIZE * 4), %LOOP_REG
291# endif
292	/* Avoid imm32 compare here to save code size.  */
293	cmpq	%rdi, %rcx
294#else
295	addq	$-(VEC_SIZE * 4), %END_REG
296	cmpq	$(VEC_SIZE * 8), %rdx
297#endif
298	jbe	L(last_4x_vec)
299#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
300	/* Set LOOP_REG (rdx).  */
301	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
302#endif
303	/* Align dst for loop.  */
304	andq	$(VEC_SIZE * -2), %LOOP_REG
305	.p2align 4
306L(loop):
307	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
308	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
309	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
310	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
311	subq	$-(VEC_SIZE * 4), %LOOP_REG
312	cmpq	%END_REG, %LOOP_REG
313	jb	L(loop)
314	.p2align 4,, MOV_SIZE
315L(last_4x_vec):
316	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
317	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
318	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
319	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
320L(return):
321#if VEC_SIZE > 16
322	ZERO_UPPER_VEC_REGISTERS_RETURN
323#else
324	ret
325#endif
326
327	.p2align 4,, 10
328#ifndef USE_LESS_VEC_MASK_STORE
329# if defined USE_MULTIARCH && IS_IN (libc)
330	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
331	   range for 2-byte jump encoding.  */
332L(stosb_local):
333	movzbl	%sil, %eax
334	mov	%RDX_LP, %RCX_LP
335	mov	%RDI_LP, %RDX_LP
336	rep	stosb
337	mov	%RDX_LP, %RAX_LP
338	VZEROUPPER_RETURN
339# endif
340	/* Define L(less_vec) only if not otherwise defined.  */
341	.p2align 4
342L(less_vec):
343	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
344	   xmm). This is only does anything for AVX2.  */
345	MEMSET_VDUP_TO_VEC0_LOW ()
346L(less_vec_from_wmemset):
347#endif
348L(cross_page):
349#if VEC_SIZE > 32
350	cmpl	$32, %edx
351	jge	L(between_32_63)
352#endif
353#if VEC_SIZE > 16
354	cmpl	$16, %edx
355	jge	L(between_16_31)
356#endif
357#ifndef USE_XMM_LESS_VEC
358	MOVQ	%XMM0, %SET_REG64
359#endif
360	cmpl	$8, %edx
361	jge	L(between_8_15)
362	cmpl	$4, %edx
363	jge	L(between_4_7)
364	cmpl	$1, %edx
365	jg	L(between_2_3)
366	jl	L(between_0_0)
367	movb	%SET_REG8, (%LESS_VEC_REG)
368L(between_0_0):
369	ret
370
371	/* Align small targets only if not doing so would cross a fetch line.
372	 */
373#if VEC_SIZE > 32
374	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
375	/* From 32 to 63.  No branch when size == 32.  */
376L(between_32_63):
377	VMOVU	%YMM0, (%LESS_VEC_REG)
378	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
379	VZEROUPPER_RETURN
380#endif
381
382#if VEC_SIZE >= 32
383	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
384L(between_16_31):
385	/* From 16 to 31.  No branch when size == 16.  */
386	VMOVU	%XMM0, (%LESS_VEC_REG)
387	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
388	ret
389#endif
390
391	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
392	 */
393	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
394L(between_8_15):
395	/* From 8 to 15.  No branch when size == 8.  */
396#ifdef USE_XMM_LESS_VEC
397	MOVQ	%XMM0, (%rdi)
398	MOVQ	%XMM0, -8(%rdi, %rdx)
399#else
400	movq	%SET_REG64, (%LESS_VEC_REG)
401	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
402#endif
403	ret
404
405	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
406	 */
407	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
408L(between_4_7):
409	/* From 4 to 7.  No branch when size == 4.  */
410#ifdef USE_XMM_LESS_VEC
411	MOVD	%XMM0, (%rdi)
412	MOVD	%XMM0, -4(%rdi, %rdx)
413#else
414	movl	%SET_REG32, (%LESS_VEC_REG)
415	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
416#endif
417	ret
418
419	/* 4 * XMM_SMALL for the third mov for AVX2.  */
420	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
421L(between_2_3):
422	/* From 2 to 3.  No branch when size == 2.  */
423#ifdef USE_XMM_LESS_VEC
424	movb	%SET_REG8, (%rdi)
425	movb	%SET_REG8, 1(%rdi)
426	movb	%SET_REG8, -1(%rdi, %rdx)
427#else
428	movw	%SET_REG16, (%LESS_VEC_REG)
429	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
430#endif
431	ret
432END (MEMSET_SYMBOL (__memset, unaligned_erms))
433