1/* Copyright 2002 Andi Kleen, SuSE Labs */
2
3/*
4 * ISO C memset - set a memory block to a byte value.
5 *
6 * rdi   destination
7 * rsi   value (char)
8 * rdx   count (bytes)
9 *
10 * rax   original destination
11 */
12 	.globl __memset
13	.globl memset
14	.p2align 4
15memset:
16__memset:
17	movq %rdi,%r10
18	movq %rdx,%r11
19
20	/* expand byte value  */
21	movzbl %sil,%ecx
22	movabs $0x0101010101010101,%rax
23	mul    %rcx		/* with rax, clobbers rdx */
24
25	/* align dst */
26	movl  %edi,%r9d
27	andl  $7,%r9d
28	jnz  .Lbad_alignment
29.Lafter_bad_alignment:
30
31	movl %r11d,%ecx
32	shrl $6,%ecx
33	jz	 .Lhandle_tail
34
35	.p2align 4
36.Lloop_64:
37	decl   %ecx
38	movq  %rax,(%rdi)
39	movq  %rax,8(%rdi)
40	movq  %rax,16(%rdi)
41	movq  %rax,24(%rdi)
42	movq  %rax,32(%rdi)
43	movq  %rax,40(%rdi)
44	movq  %rax,48(%rdi)
45	movq  %rax,56(%rdi)
46	leaq  64(%rdi),%rdi
47	jnz    .Lloop_64
48
49	/* Handle tail in loops. The loops should be faster than hard
50	   to predict jump tables. */
51	.p2align 4
52.Lhandle_tail:
53	movl	%r11d,%ecx
54	andl    $63&(~7),%ecx
55	jz 		.Lhandle_7
56	shrl	$3,%ecx
57	.p2align 4
58.Lloop_8:
59	decl   %ecx
60	movq  %rax,(%rdi)
61	leaq  8(%rdi),%rdi
62	jnz    .Lloop_8
63
64.Lhandle_7:
65	movl	%r11d,%ecx
66	andl	$7,%ecx
67	jz      .Lende
68	.p2align 4
69.Lloop_1:
70	decl    %ecx
71	movb 	%al,(%rdi)
72	leaq	1(%rdi),%rdi
73	jnz     .Lloop_1
74
75.Lende:
76	movq	%r10,%rax
77	ret
78
79.Lbad_alignment:
80	cmpq $7,%r11
81	jbe	.Lhandle_7
82	movq %rax,(%rdi)	/* unaligned store */
83	movq $8,%r8
84	subq %r9,%r8
85	addq %r8,%rdi
86	subq %r8,%r11
87	jmp .Lafter_bad_alignment
88