/* Optimized memset implementation for PowerPC64.
   Copyright (C) 1997-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's' (r3 is only ever read below, never written).

   Strategy, chosen by length:
     - n <= 8: plain byte stores (L(small)).
     - Otherwise the destination is first brought to a doubleword
       boundary while the fill byte is replicated across rCHR
       (byte -> halfword -> word -> doubleword).
     - Original n <= 31: a descending tail of byte / halfword / word /
       doubleword stores selected by the low bits of the remaining
       length (L(medium)).
     - Longer: align to 32 bytes, then store 32-byte blocks, then the
       tail.  There is a special case for a zero fill pattern, which
       uses the dcbz instruction on 128-byte lines once the pointer is
       128-byte aligned.

   CR usage: cr7 (CR bits 28-31) is loaded with mtcrf 0x01 and holds the
   low four bits of an address or length; "bt/bf 28..31" test those
   bits (28 = the 8s bit, 29 = 4s, 30 = 2s, 31 = 1s).  */

#ifndef MEMSET
# define MEMSET memset
#endif
	.machine power4
ENTRY_TOCLESS (MEMSET, 5)
	CALL_MCOUNT 3

/* Register roles.  r8 is shared by rMEMP2, rNEG64 and rCLS; each user
   (re)loads it before use.  rRTN and rCLM are not referenced below in
   this file.  */
#define rTMP	r0
#define rRTN	r3	/* Initial value of 1st argument.  */
#define rMEMP0	r3	/* Original value of 1st arg.  */
#define rCHR	r4	/* Char to set in each byte.  */
#define rLEN	r5	/* Length of region to set.  */
#define rMEMP	r6	/* Address at which we are storing.  */
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning).  */
#define rMEMP2	r8

#define rNEG64	r8	/* Constant -64 for clearing with dcbz.  */
#define rCLS	r8	/* Cache line size obtained from static.  */
#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */
L(_memset):
/* Take care of case for size <= 8.  */
	cmpldi	cr1, rLEN, 8
	andi.	rALIGN, rMEMP0, 7	/* rALIGN = s & 7; cr0.eq if s is doubleword aligned.  */
	mr	rMEMP, rMEMP0
	ble-	cr1, L(small)

/* Align to doubleword boundary.  cr5 records the ORIGINAL length
   against 31; it is consumed at L(aligned) after rLEN has been reduced
   by the alignment bytes.  */
	cmpldi	cr5, rLEN, 31
	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
	beq+	L(aligned2)		/* cr0 from the andi. above: already aligned.  */
	mtcrf	0x01, rMEMP0		/* cr7 = low four bits of the start address.  */
	subfic	rALIGN, rALIGN, 8	/* Bytes needed to reach the next doubleword.  */
	cror	28,30,31		/* Detect odd word aligned.  */
	add	rMEMP, rMEMP, rALIGN	/* rMEMP = first doubleword boundary after s.  */
	sub	rLEN, rLEN, rALIGN
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
	bt	29, L(g4)		/* 4s bit set: s lies in the odd word.  */
/* Process the even word of doubleword.  */
	bf+	31, L(g2)		/* Even address: no leading byte.  */
	stb	rCHR, 0(rMEMP0)
	bt	30, L(g4x)		/* s & 3 == 3: the byte reached a word boundary.  */
L(g2):
	sth	rCHR, -6(rMEMP)
L(g4x):
	stw	rCHR, -4(rMEMP)
	b	L(aligned)
/* Process the odd word of doubleword.  */
L(g4):
	bf	28, L(g4x) /* If false, word aligned on odd word.  */
	bf+	31, L(g0)		/* Even address: only a halfword is needed.  */
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)		/* s & 7 == 7: that single byte was all.  */
L(g0):
	sth	rCHR, -2(rMEMP)

/* Handle the case of size <= 31.  */
L(aligned2):
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
L(aligned):
	mtcrf	0x01, rLEN		/* cr7 = low four bits of the remaining length.  */
	ble	cr5, L(medium)		/* Original n <= 31.  */
/* Align to 32-byte boundary.  */
	andi.	rALIGN, rMEMP, 0x18
	subfic	rALIGN, rALIGN, 0x20	/* 8, 16 or 24 bytes when not yet aligned.  */
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
	beq	L(caligned)		/* cr0 from the andi. above: already 32-byte aligned.  */
	mtcrf	0x01, rALIGN		/* cr7 bit 28 = the 8s bit of rALIGN.  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP		/* Store backwards from the 32-byte boundary.  */
	bf	28, L(a1)
	stdu	rCHR, -8(rMEMP2)	/* One doubleword when rALIGN is 8 or 24.  */
L(a1):	blt	cr1, L(a2)
	std	rCHR, -8(rMEMP2)	/* Two more doublewords when rALIGN >= 16.  */
	stdu	rCHR, -16(rMEMP2)
L(a2):

/* Now aligned to a 32 byte boundary.  */
L(caligned):
	cmpldi	cr1, rCHR, 0		/* Is the replicated fill pattern zero?  */
	clrrdi.	rALIGN, rLEN, 5		/* rALIGN = bytes in whole 32-byte blocks; sets cr0.  */
	mtcrf	0x01, rLEN		/* cr7 = low four bits of the remaining length.  */
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
/* Store whole 32-byte blocks.  On entry rALIGN = rLEN rounded down to
   a multiple of 32 and cr0 reflects that value.  */
L(nondcbz):
	srdi	rTMP, rALIGN, 5		/* Number of 32-byte blocks.  */
	mtctr	rTMP
	beq	L(medium)	/* We may not actually get to do a full line.  */
	clrldi.	rLEN, rLEN, 59		/* rLEN = tail bytes (0-31); cr0.eq if there is no tail.  */
	add	rMEMP, rMEMP, rALIGN	/* Point past the last whole block; blocks are filled downwards.  */
	li	rNEG64, -0x40
	bdz	L(cloopdone)		/* Exactly one block: skip the loop.  */

L(c3):	dcbtst	rNEG64, rMEMP		/* Touch for store 64 bytes below the current pointer.  */
	std	rCHR, -8(rMEMP)
	std	rCHR, -16(rMEMP)
	std	rCHR, -24(rMEMP)
	stdu	rCHR, -32(rMEMP)
	bdnz	L(c3)
L(cloopdone):
	std	rCHR, -8(rMEMP)
	std	rCHR, -16(rMEMP)
	cmpldi	cr1, rLEN, 16		/* cr1 selects the 16-byte step of the tail.  */
	std	rCHR, -24(rMEMP)
	stdu	rCHR, -32(rMEMP)
	beqlr				/* cr0 from the clrldi. above: no tail, done.  */
	add	rMEMP, rMEMP, rALIGN	/* Back to the end of the block region.  */
	b	L(medium_tail2)

	.align 5
/* Clear lines of memory in 128-byte chunks.  */
L(zloopstart):
/* If the remaining length is less than 32 bytes, don't bother getting
   the cache line size.  (cr0 still comes from the clrrdi. at
   L(caligned).)  */
	beq	L(medium)
	li	rCLS,128	/* cache line size is 128 */

/* Now we know the cache line size, and it is not 32-bytes, but
   we may not yet be aligned to the cache line.  May have a partial
   line to fill, so touch it 1st.  */
	dcbt	0,rMEMP
/* Store 32 bytes at a time until rMEMP is 128-byte aligned or fewer
   than 32 bytes remain.  */
L(getCacheAligned):
	cmpldi	cr1,rLEN,32
	andi.	rTMP,rMEMP,127
	blt	cr1,L(handletail32)
	beq	L(cacheAligned)
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	std	rCHR,-32(rMEMP)
	std	rCHR,-24(rMEMP)
	std	rCHR,-16(rMEMP)
	std	rCHR,-8(rMEMP)
	b	L(getCacheAligned)

/* Now we are aligned to the cache line and can use dcbz.  */
L(cacheAligned):
	cmpld	cr1,rLEN,rCLS
	blt	cr1,L(handletail32)
	dcbz	0,rMEMP
	subf	rLEN,rCLS,rLEN
	add	rMEMP,rMEMP,rCLS
	b	L(cacheAligned)

/* We are here because the remainder (rLEN) is less than 32 bytes, or
   the pointer is cache-line aligned and the remainder is less than the
   cache line size.  So set up the preconditions for L(nondcbz)
   (rALIGN = whole-block bytes, cr0 set from it) and go there.
   rLEN < 128 on every path here, so the word-form mask is enough.  */
L(handletail32):
	clrrwi.	rALIGN, rLEN, 5
	b	L(nondcbz)

	.align 5
L(small):
/* Memset of 8 bytes or less.  Only byte stores are used, so rCHR does
   not need to be replicated on this path.  */
	cmpldi	cr6, rLEN, 4
	cmpldi	cr5, rLEN, 1
	ble	cr6,L(le4)
	subi	rLEN, rLEN, 4		/* 5-8 bytes: store four, then fall into the <= 4 case.  */
	stb	rCHR,0(rMEMP)
	stb	rCHR,1(rMEMP)
	stb	rCHR,2(rMEMP)
	stb	rCHR,3(rMEMP)
	addi	rMEMP,rMEMP, 4
	cmpldi	cr5, rLEN, 1
L(le4):
	cmpldi	cr1, rLEN, 3
	bltlr	cr5			/* rLEN == 0.  */
	stb	rCHR, 0(rMEMP)
	beqlr	cr5			/* rLEN == 1.  */
	stb	rCHR, 1(rMEMP)
	bltlr	cr1			/* rLEN == 2.  */
	stb	rCHR, 2(rMEMP)
	beqlr	cr1			/* rLEN == 3.  */
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.  Expects rMEMP doubleword aligned and cr7
   holding the low four bits of rLEN.  The tail is written backwards
   from the end of the region: byte, halfword, word, 16 bytes,
   doubleword, each only when the corresponding length bit is set.  */
	.align 5
L(medium):
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
	cmpldi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN	/* rMEMP = one past the last byte to set.  */
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt-	29, L(medium_29t)
L(medium_29f):
	bge-	cr1, L(medium_27t)
	bflr-	28			/* No trailing doubleword: done.  */
	std	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt-	cr1, L(medium_27f)
L(medium_27t):
	std	rCHR, -8(rMEMP)
	stdu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr-	28			/* No trailing doubleword: done.  */
L(medium_28t):
	std	rCHR, -8(rMEMP)
	blr
END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)