/* Optimized memset implementation for PowerPC64.
   Copyright (C) 1997-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <rtld-global-offsets.h>

/* Declare access to dl_cache_line_size, which is read further down
   through __GLRO.  Both macros are defined outside this file.  */
	.section	".toc","aw"
__GLRO_DEF(dl_cache_line_size)

	.section	".text"
	.align 2

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   Lengths of 8 bytes or less are stored one byte at a time.  Longer
   regions are first aligned to a doubleword boundary using byte,
   halfword and word stores.  Regions whose length is more than 31
   bytes are then aligned to a 32-byte boundary with doubleword stores
   and filled in 32-byte blocks.  When the fill byte is zero and a
   nonzero cache line size has been read from dl_cache_line_size,
   whole cache lines are cleared with dcbz instead.  Whatever is left
   (0-31 bytes) is stored by the code at L(medium).

   Condition register usage: several paths load the low four bits of
   an address or a length into CR7 with mtcrf 0x01 and then branch on
   CR bits 28-31, which correspond to the 8, 4, 2 and 1 bits of that
   value.  */

#ifndef MEMSET
# define MEMSET memset
#endif

ENTRY (MEMSET, 5)
	CALL_MCOUNT 3

/* Register roles.  Note that rMEMP2, rNEG64 and rCLS are three names
   for the same register (r8); their uses do not overlap.  */
#define rTMP	r0	/* Scratch.  */
#define rRTN	r3	/* Return value; same register as the 1st argument.  */
#define rMEMP0	r3	/* Original value of 1st arg.  */
#define rCHR	r4	/* Char to set in each byte.  */
#define rLEN	r5	/* Length of region to set.  */
#define rMEMP	r6	/* Address at which we are storing.  */
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning).  */
#define rMEMP2	r8	/* Store pointer used while aligning to 32 bytes.  */

#define rNEG64	r8	/* Constant -64, used as the dcbtst offset.  */
#define rCLS	r8	/* Cache line size obtained from static.  */
#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */
L(_memset):
/* Take care of case for size <= 8.  The andi. result in cr0 (is 's'
   already doubleword aligned?) is consumed by the beq+ below.  */
	cmpldi	cr1, rLEN, 8
	andi.	rALIGN, rMEMP0, 7
	mr	rMEMP, rMEMP0
	ble-	cr1, L(small)

/* Align to doubleword boundary.  cr5 records whether the original
   length is <= 31; it is tested at L(aligned).  */
	cmpldi	cr5, rLEN, 31
	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
	beq+	L(aligned2)
	mtcrf	0x01, rMEMP0		/* CR7 = low four bits of 's'.  */
	subfic	rALIGN, rALIGN, 8	/* Bytes up to the next doubleword.  */
	cror	28,30,31		/* Bit 28 = 's' is not word aligned.  */
	add	rMEMP, rMEMP, rALIGN	/* rMEMP = next doubleword boundary.  */
	sub	rLEN, rLEN, rALIGN
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
	bt	29, L(g4)
/* Process the even word of doubleword (s & 7 is 1, 2 or 3).  The
   stores below use negative offsets from the already advanced rMEMP.  */
	bf+	31, L(g2)		/* 's' is even: halfword then word.  */
	stb	rCHR, 0(rMEMP0)
	bt	30, L(g4x)		/* s & 3 == 3: only a word remains.  */
L(g2):
	sth	rCHR, -6(rMEMP)
L(g4x):
	stw	rCHR, -4(rMEMP)
	b	L(aligned)
/* Process the odd word of doubleword (s & 7 is 4, 5, 6 or 7).  */
L(g4):
	bf	28, L(g4x) /* If false, word aligned on odd word.  */
	bf+	31, L(g0)		/* 's' is even: one halfword.  */
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)		/* s & 7 == 7: the byte was enough.  */
L(g0):
	sth	rCHR, -2(rMEMP)

/* rMEMP is now doubleword aligned.  L(aligned2) is the entry used when
   's' was aligned to begin with, so the halfword still has to be
   replicated to a word; on the fall-through path the insrdi merely
   repeats what was already done.  Regions whose original length was
   <= 31 (cr5) are finished by L(medium).  */
L(aligned2):
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
L(aligned):
	mtcrf	0x01, rLEN		/* CR7 = low four bits of rLEN.  */
	ble	cr5, L(medium)
/* Align to 32-byte boundary.  The beq tests cr0 from the andi.; the
   subfic in between does not write cr0.  */
	andi.	rALIGN, rMEMP, 0x18
	subfic	rALIGN, rALIGN, 0x20
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
	beq	L(caligned)
	mtcrf	0x01, rALIGN		/* CR bit 28 = 8 bit of rALIGN.  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP		/* Store downwards through rMEMP2.  */
	bf	28, L(a1)
	stdu	rCHR, -8(rMEMP2)
L(a1):	blt	cr1, L(a2)		/* Fewer than 16 bytes: done.  */
	std	rCHR, -8(rMEMP2)
	stdu	rCHR, -16(rMEMP2)
L(a2):

/* Now aligned to a 32 byte boundary.  rALIGN becomes rLEN rounded down
   to a multiple of 32, and cr0 tells whether that is zero.  CR7 is
   reloaded because rLEN has changed.  */
L(caligned):
	cmpldi	cr1, rCHR, 0
	clrrdi.	rALIGN, rLEN, 5
	mtcrf	0x01, rLEN
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
/* Fill rALIGN bytes in 32-byte blocks.  Expects rALIGN to be a multiple
   of 32, cr0 to reflect rALIGN == 0, and CR7 to hold the low four bits
   of rLEN.  rMEMP is moved to the end of the blocks and the stores walk
   downwards, so rMEMP is back at the start of the blocks afterwards.  */
L(nondcbz):
	srdi	rTMP, rALIGN, 5		/* Number of 32-byte blocks.  */
	mtctr	rTMP
	beq	L(medium)	/* We may not actually get to do a full line.  */
	clrldi.	rLEN, rLEN, 59		/* rLEN = tail length (rLEN & 31).  */
	add	rMEMP, rMEMP, rALIGN
	li	rNEG64, -0x40
	bdz	L(cloopdone)		/* Exactly one block: skip the loop.  */

L(c3):	dcbtst	rNEG64, rMEMP		/* Touch for store at rMEMP - 64.  */
	std	rCHR, -8(rMEMP)
	std	rCHR, -16(rMEMP)
	std	rCHR, -24(rMEMP)
	stdu	rCHR, -32(rMEMP)
	bdnz	L(c3)
/* Last block.  cr1 is prepared for L(medium_tail2); the beqlr tests cr0
   from the clrldi. above and returns when there is no tail.  */
L(cloopdone):
	std	rCHR, -8(rMEMP)
	std	rCHR, -16(rMEMP)
	cmpldi	cr1, rLEN, 16
	std	rCHR, -24(rMEMP)
	stdu	rCHR, -32(rMEMP)
	beqlr
	add	rMEMP, rMEMP, rALIGN	/* Back to the end of the blocks.  */
	b	L(medium_tail2)

	.align 5
/* Zero fill: clear whole cache lines with dcbz, stepping by the cache
   line size read at run time.  */
L(zloopstart):
/* If the remaining length is less than 32 bytes, don't bother getting
   the cache line size.  */
	beq	L(medium)
	/* Read the cache line size.  */
	__GLRO (rCLS, dl_cache_line_size,
		RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)

/* If the cache line size was not set just go to L(nondcbz) which is
   safe for any cache line size.  */
	cmpldi	cr1,rCLS,0
	beq	cr1,L(nondcbz)


/* Now we know the cache line size is nonzero, but we may not yet be
   aligned to the cache line.  May have a partial line to fill, so
   touch it 1st.
   NOTE(review): using rCLS - 1 as a mask presumes the cache line size
   is a power of two -- confirm.  */
	dcbt	0,rMEMP
	addi	rCLM,rCLS,-1
/* Store 32 zero bytes at a time until rMEMP is cache line aligned or
   fewer than 32 bytes remain.  */
L(getCacheAligned):
	cmpldi	cr1,rLEN,32
	and.	rTMP,rCLM,rMEMP
	blt	cr1,L(handletail32)
	beq	L(cacheAligned)
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	std	rCHR,-32(rMEMP)
	std	rCHR,-24(rMEMP)
	std	rCHR,-16(rMEMP)
	std	rCHR,-8(rMEMP)
	b	L(getCacheAligned)

/* Now we are aligned to the cache line and can use dcbz.  Clear one
   cache line per iteration while at least a full line remains.  */
L(cacheAligned):
	cmpld	cr1,rLEN,rCLS
	blt	cr1,L(handletail32)
	dcbz	0,rMEMP
	subf	rLEN,rCLS,rLEN
	add	rMEMP,rMEMP,rCLS
	b	L(cacheAligned)

/* We are here because the cache line size was set and the remainder
   (rLEN) is either less than 32 bytes or less than the actual cache
   line size.  So set up the preconditions for L(nondcbz) (rALIGN and
   cr0) and go there.
   NOTE(review): CR7 still holds the low four bits of rLEN as loaded at
   L(caligned); this appears to rely on the cache line size being a
   multiple of 16 -- confirm.  */
L(handletail32):
	clrrwi.	rALIGN, rLEN, 5
	b	L(nondcbz)

	.align 5
L(small):
/* Memset of 8 bytes or less, done with byte stores only.  For 5-8
   bytes the first four are stored unconditionally and the remaining
   1-4 are handled by L(le4).  */
	cmpldi	cr6, rLEN, 4
	cmpldi	cr5, rLEN, 1
	ble	cr6,L(le4)
	subi	rLEN, rLEN, 4
	stb	rCHR,0(rMEMP)
	stb	rCHR,1(rMEMP)
	stb	rCHR,2(rMEMP)
	stb	rCHR,3(rMEMP)
	addi	rMEMP,rMEMP, 4
	cmpldi	cr5, rLEN, 1
/* Store 0-4 bytes.  cr5 compares rLEN with 1 and cr1 compares it with
   3, so each conditional return below fires at one specific length.  */
L(le4):
	cmpldi	cr1, rLEN, 3
	bltlr	cr5			/* rLEN == 0.  */
	stb	rCHR, 0(rMEMP)
	beqlr	cr5			/* rLEN == 1.  */
	stb	rCHR, 1(rMEMP)
	bltlr	cr1			/* rLEN == 2.  */
	stb	rCHR, 2(rMEMP)
	beqlr	cr1			/* rLEN == 3.  */
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.  Expects CR7 to hold the low four bits of rLEN.
   rMEMP is moved to the end of the region and the pieces are stored
   downwards: 1, 2 and 4 bytes according to CR bits 31, 30 and 29, then
   16 bytes if rLEN >= 16 (cr1), then 8 bytes if CR bit 28 is set.
   L(medium_tail2) is the entry for callers that have already
   replicated rCHR to a doubleword and set cr1.  */
	.align 5
L(medium):
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
	cmpldi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt-	29, L(medium_29t)
L(medium_29f):
	bge-	cr1, L(medium_27t)
	bflr-	28			/* No 8-byte piece: done.  */
	std	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt-	cr1, L(medium_27f)
L(medium_27t):
	std	rCHR, -8(rMEMP)
	stdu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr-	28			/* No 8-byte piece: done.  */
L(medium_28t):
	std	rCHR, -8(rMEMP)
	blr
END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)