1/* Optimized memset for PowerPC405,440,464 (32-byte cacheline).
2   Copyright (C) 2012-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library.  If not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
/* memset

       r3:  destination address; also the return value
       r4:  fill value (only the low byte is stored)
       r5:  byte count
       r11: fill byte replicated into all 32 bits of the register
       r12: saved copy of the original destination address

       Save the destination address in r12 so it can be returned.
       If the count is greater than 255 bytes and r4 is non-zero,
       set 0-3 bytes to make the destination word aligned.
       If the count is greater than 255 bytes and r4 is zero,
       align the destination to a 32-byte cache line and use dcbz
       to clear memory where we can.
       In all cases the remainder is handled as follows:
       If there are 16 or more bytes left to set, use the 4-word
       (16-byte) store loop.
       Finally, set the 0-15 remaining bytes with a string store.  */
37
EALIGN (memset, 5, 0)
       /* Build the fill word: r11 = low byte of r4 copied into every
          byte of the register.  */
       clrlwi  r11,r4,24
       rlwimi  r11,r4,8,16,23
       rlwimi  r11,r11,16,0,15
       /* r12 keeps the original destination; it is moved back into r3
          before returning.  */
       mr      r12,r3
       /* Counts of 255 or less (signed compare) skip both the alignment
          step and the dcbz path.  */
       cmpwi   r5,255
       ble     L(fill_blocks)
       /* A zero r4 with a large count takes the dcbz path.  */
       cmpwi   r4,0
       beq     L(zero_fill)
       /* r6 = number of bytes (0-3) up to the next word boundary.  */
       neg     r6,r3
       clrlwi. r6,r6,30
       beq     L(fill_blocks)
       li      r8,1
       mtctr   r6
       /* Bias r3 by -1 for the update-form byte store below.  */
       subi    r3,r3,1

/* Store single bytes until the destination is word aligned.  */
L(align_bytes):
       stbu    r11,1(r3)
       /* r5 = r5 - 1 (r8 holds the constant 1).  */
       subf.   r5,r8,r5
       beq     L(done)
       bdnz    L(align_bytes)
       /* Undo the -1 bias so r3 points at the next byte to set.  */
       addi    r3,r3,1

/* Store 16 bytes per iteration while at least 16 bytes remain.  */
L(fill_blocks):
       /* r6 = number of whole 16-byte blocks.  */
       srwi.   r6,r5,4
       beq     L(fill_tail)
       mtctr   r6
       /* Bias r3 by -4 for the update-form word stores below.  */
       subi    r3,r3,4
       mr      r8,r11
       mr      r9,r11
       mr      r10,r11

L(fill_block_loop):
       stwu    r8,4(r3)
       stwu    r9,4(r3)
       subi    r5,r5,16
       stwu    r10,4(r3)
       stwu    r11,4(r3)
       bdnz    L(fill_block_loop)
       /* Undo the -4 bias.  */
       addi    r3,r3,4

/* Store the last 0-15 bytes with one string store from r8-r11.  */
L(fill_tail):
       clrlwi. r7,r5,28
       beq     L(done)
       mr      r8,r11
       mr      r9,r11
       mr      r10,r11
       /* The string store takes its byte count from XER.  */
       mtxer   r7
       stswx   r8,0,r3

/* Return the original destination pointer.  */
L(done):
       mr      r3,r12
       blr

/* Large zero fill: align to a 32-byte cache line, clear whole lines
   with dcbz, then finish the remainder with ordinary stores.  */
L(zero_fill):
       /* r6 = -destination; r7 = number of bytes (0-15) up to the next
          16-byte boundary.  */
       neg     r6,r3
       clrlwi. r7,r6,28
       beq     L(align_cache_line)
       mr      r8,r11
       mr      r9,r11
       mr      r10,r11
       subf    r5,r7,r5
       mtxer   r7
       stswx   r8,0,r3
       add     r3,r3,r7

L(align_cache_line):
       /* r8 = 1 when one more 16-byte block is needed to reach a
          32-byte boundary, otherwise 0.  */
       clrlwi  r8,r6,27
       srwi.   r8,r8,4
       beq     L(clear_lines)
       mtctr   r8

L(align_block_loop):
       stw     r11,0(r3)
       subi    r5,r5,16
       stw     r11,4(r3)
       stw     r11,8(r3)
       stw     r11,12(r3)
       addi    r3,r3,16
       bdnz    L(align_block_loop)

L(clear_lines):
       /* r6 = number of whole 32-byte lines left to clear.  */
       srwi    r6,r5,5
       mtctr   r6
       /* r7 = 0 is the index operand for dcbz.  */
       li      r7,0

L(clear_line_loop):
       dcbz    r3,r7
       addi    r3,r3,32
       subi    r5,r5,32
       bdnz    L(clear_line_loop)
       /* Fewer than 32 bytes remain; r6 = remaining 16-byte blocks.  */
       srwi.   r6,r5,4
       beq     L(zero_tail)
       mtctr   r6

L(zero_block_loop):
       stw     r11,0(r3)
       subi    r5,r5,16
       stw     r11,4(r3)
       stw     r11,8(r3)
       stw     r11,12(r3)
       addi    r3,r3,16
       bdnz    L(zero_block_loop)

/* Store the last 0-15 bytes with one string store from r8-r11.  */
L(zero_tail):
       clrlwi. r7,r5,28
       beq     L(done)
       mr      r8,r11
       mr      r9,r11
       mr      r10,r11
       mtxer   r7
       stswx   r8,0,r3
       b       L(done)
END (memset)
libc_hidden_builtin_def (memset)
153