/* Optimized memset implementation for PowerPC64.
   Copyright (C) 1997-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   The memset is done in three sizes: byte (8 bits), word (32 bits),
   cache line (1024 bits).  There is a special case for setting cache lines
   to 0, to take advantage of the dcbz instruction.

   Overview of the paths below:
     L(small)       n <= 4: straight-line byte stores.
     L(medium)      0-31 bytes, selected by the low bits of the length
		    held in CR7 plus a compare against 16 held in cr1.
     L(nondcbz)     32-byte blocks written with word stores.
     L(zloopstart)  fill word is zero: 128-byte lines cleared with dcbz.

   r3 is never written, so it still holds 's' at every return.  */

	.machine power4
EALIGN (memset, 5, 0)
	CALL_MCOUNT

#define rTMP	r0
#define rRTN	r3	/* Initial value of 1st argument.  */
#define rMEMP0	r3	/* Original value of 1st arg.  */
#define rCHR	r4	/* Char to set in each byte.  */
#define rLEN	r5	/* Length of region to set.  */
#define rMEMP	r6	/* Address at which we are storing.  */
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning).  */
#define rMEMP2	r8	/* Scratch store pointer while aligning to 32 bytes.  */

/* r8 is reused: rMEMP2, rNEG64 and rCLS are never live at the same time.  */
#define rNEG64	r8	/* Constant -64, used as the dcbtst offset in L(c3).  */
#define rCLS	r8	/* Cache line size (known to be 128).  */
#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */
L(_memset):
/* Take care of case for size <= 4.  */
	cmplwi	cr1, rLEN, 4
	andi.	rALIGN, rMEMP0, 3	/* cr0.eq <=> already word aligned.  */
	mr	rMEMP, rMEMP0
	ble-	cr1, L(small)

/* Align to word boundary.  */
	cmplwi	cr5, rLEN, 31		/* cr5 is consumed at L(aligned).  */
	insrwi	rCHR, rCHR, 8, 16	/* Replicate byte to halfword.  */
	beq+	L(aligned)
	mtcrf	0x01, rMEMP0		/* CR7 = low 4 bits of the address.  */
	subfic	rALIGN, rALIGN, 4	/* Bytes needed to reach alignment.  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	bf+	31, L(g0)		/* Even address: one halfword only.  */
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)		/* Address & 3 == 3: one byte was enough.  */
L(g0):
	sth	rCHR, -2(rMEMP)

/* Handle the case of size <= 31 (cr5 was set from the length before the
   word-alignment bytes were subtracted).  */
L(aligned):
	mtcrf	0x01, rLEN		/* CR7 = low 4 bits of the length.  */
	insrwi	rCHR, rCHR, 16, 0	/* Replicate halfword to word.  */
	ble	cr5, L(medium)
/* Align to 32-byte boundary.  */
	andi.	rALIGN, rMEMP, 0x1C
	subfic	rALIGN, rALIGN, 0x20
	beq	L(caligned)		/* cr0 from andi.: already aligned.  */
	mtcrf	0x01, rALIGN		/* CR7 = low 4 bits of the pad size.  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP		/* Store backwards from the boundary.  */
	bf	28, L(a1)		/* 8-byte piece.  */
	stw	rCHR, -4(rMEMP2)
	stwu	rCHR, -8(rMEMP2)
L(a1):	blt	cr1, L(a2)		/* 16-byte piece.  */
	stw	rCHR, -4(rMEMP2)
	stw	rCHR, -8(rMEMP2)
	stw	rCHR, -12(rMEMP2)
	stwu	rCHR, -16(rMEMP2)
L(a2):	bf	29, L(caligned)		/* 4-byte piece.  */
	stw	rCHR, -4(rMEMP2)

/* Now aligned to a 32 byte boundary.  */
L(caligned):
	cmplwi	cr1, rCHR, 0
	clrrwi.	rALIGN, rLEN, 5		/* rALIGN = rLEN & ~31; cr0.eq <=> no full block.  */
	mtcrf	0x01, rLEN		/* Refresh CR7 with the updated length.  */
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
L(nondcbz):
	srwi	rTMP, rALIGN, 5		/* Number of 32-byte blocks.  */
	mtctr	rTMP
	beq	L(medium)	/* We may not actually get to do a full line.  */
	clrlwi.	rLEN, rLEN, 27		/* rLEN &= 31; cr0.eq <=> no tail.  */
	add	rMEMP, rMEMP, rALIGN	/* Point past the last full block.  */
	li	rNEG64, -0x40
	bdz	L(cloopdone)		/* Exactly one block: skip the loop.  */

	.align 4
/* Store 32 bytes per iteration, walking downwards.  */
L(c3):	dcbtst	rNEG64, rMEMP
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stw	rCHR, -16(rMEMP)
	stw	rCHR, -20(rMEMP)
	stw	rCHR, -24(rMEMP)
	stw	rCHR, -28(rMEMP)
	stwu	rCHR, -32(rMEMP)
	bdnz	L(c3)
L(cloopdone):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stw	rCHR, -16(rMEMP)
	cmplwi	cr1, rLEN, 16		/* cr1 is consumed by L(medium_tail2).  */
	stw	rCHR, -20(rMEMP)
	stw	rCHR, -24(rMEMP)
	stw	rCHR, -28(rMEMP)
	stwu	rCHR, -32(rMEMP)
	beqlr				/* cr0 from clrlwi.: nothing left.  */
	add	rMEMP, rMEMP, rALIGN	/* Back to the end of the blocks.  */
	b	L(medium_tail2)

	.align 5
/* Clear lines of memory in 128-byte chunks.  */
L(zloopstart):
/* If the remaining length is less than 32 bytes, don't bother getting
   the cache line size.  (cr0 still holds the clrrwi. result from
   L(caligned).)  */
	beq	L(medium)
	li	rCLS,128  /* cache line size is 128 */
	dcbt	0,rMEMP
/* Store 32-byte blocks until rMEMP is 128-byte aligned or fewer than
   32 bytes remain.  */
L(getCacheAligned):
	cmplwi	cr1,rLEN,32
	andi.	rTMP,rMEMP,127
	blt	cr1,L(handletail32)
	beq	L(cacheAligned)
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	stw	rCHR,-32(rMEMP)
	stw	rCHR,-28(rMEMP)
	stw	rCHR,-24(rMEMP)
	stw	rCHR,-20(rMEMP)
	stw	rCHR,-16(rMEMP)
	stw	rCHR,-12(rMEMP)
	stw	rCHR,-8(rMEMP)
	stw	rCHR,-4(rMEMP)
	b	L(getCacheAligned)

/* Now we are aligned to the cache line and can use dcbz.  */
	.align 4
L(cacheAligned):
	cmplw	cr1,rLEN,rCLS
	blt	cr1,L(handletail32)
	dcbz	0,rMEMP
	subf	rLEN,rCLS,rLEN
	add	rMEMP,rMEMP,rCLS
	b	L(cacheAligned)

/* We are here because the cache line size was set and the remainder
   (rLEN) is less than the actual cache line size.
   So set up the preconditions for L(nondcbz) and go there.  Only
   multiples of 32 were subtracted from rLEN above, so the low length
   bits loaded into CR7 at L(caligned) are still valid.  */
L(handletail32):
	clrrwi.	rALIGN, rLEN, 5
	b	L(nondcbz)

	.align 5
L(small):
/* Memset of 4 bytes or less.  Only stb is used here, so the fill byte
   does not need to be replicated first.  */
	cmplwi	cr5, rLEN, 1
	cmplwi	cr1, rLEN, 3
	bltlr	cr5			/* n == 0.  */
	stb	rCHR, 0(rMEMP)
	beqlr	cr5			/* n == 1.  */
	stb	rCHR, 1(rMEMP)
	bltlr	cr1			/* n == 2.  */
	stb	rCHR, 2(rMEMP)
	beqlr	cr1			/* n == 3.  */
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.  Expects CR7 to hold the low 4 bits of rLEN.
   Stores run downwards from the end of the region: CR bit 31 selects a
   byte, bit 30 a halfword, bit 29 a word, cr1 (rLEN >= 16) sixteen
   bytes and bit 28 eight bytes.  */
	.align 5
L(medium):
	cmplwi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt-	29, L(medium_29t)
L(medium_29f):
	bge-	cr1, L(medium_27t)
	bflr-	28
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt-	cr1, L(medium_27f)
L(medium_27t):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stwu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr-	28
L(medium_28t):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	blr
END (memset)
libc_hidden_builtin_def (memset)