/* Optimized 64-bit memset implementation for POWER6.
   Copyright (C) 1997-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   The memset is done at several granularities:
     - single bytes for lengths of 8 or less (L(small));
     - byte/halfword/word stores to reach a doubleword boundary;
     - doubleword (64-bit) stores, grouped in 32-byte "sectors";
     - whole 128-byte cache lines for long runs.
   There is a special case for setting cache lines to 0, to take
   advantage of the dcbz instruction.

   Conventions used throughout:
     - r3 (rMEMP0/rRTN) is never written by the code below, so it still
       holds 's' at every blr.
     - "mtcrf 0x01, rX" copies the low four bits of rX into CR field 7:
       CR bit 28 = value 8, bit 29 = 4, bit 30 = 2, bit 31 = 1.  The
       bt/bf instructions on bits 28-31 therefore test individual low
       bits of an address or of the remaining length.
     - rCHR is widened step by step (byte -> halfword -> word ->
       doubleword) with insrdi as wider stores become possible.

   NOTE(review): the instruction order, the nop instructions and the
   .align directives look hand-scheduled, presumably for POWER6
   instruction dispatch; do not reorder without measuring.  */

#ifndef MEMSET
# define MEMSET memset
#endif
	.machine power6
ENTRY_TOCLESS (MEMSET, 7)
	CALL_MCOUNT 3

#define rTMP	r0
#define rRTN	r3	/* Initial value of 1st argument.  */
#define rMEMP0	r3	/* Original value of 1st arg.  */
#define rCHR	r4	/* Char to set in each byte.  */
#define rLEN	r5	/* Length of region to set.  */
#define rMEMP	r6	/* Address at which we are storing.  */
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning).  */
#define rMEMP2	r8	/* Scratch: descending store pointer while aligning to
			   32 bytes; later the constant 128 used as the index
			   operand of the second dcbz.  */
#define rMEMP3	r9	/* Alt mem pointer.  */
L(_memset):
/* Take care of case for size <= 8.  cr0 (from andi.) records whether
   the start address is already doubleword aligned; it is consumed by
   the beq+ below.  */
	cmpldi	cr1, rLEN, 8
	andi.	rALIGN, rMEMP0, 7
	mr	rMEMP, rMEMP0
	ble	cr1, L(small)

/* Align to doubleword boundary.  cr5 remembers whether the *original*
   length is <= 31; it is tested at L(aligned).  */
	cmpldi	cr5, rLEN, 31
	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
	beq+	L(aligned2)
	mtcrf	0x01, rMEMP0		/* cr7 = low address bits.  */
	subfic	rALIGN, rALIGN, 8	/* Bytes needed to reach alignment.  */
	cror	28,30,31		/* Detect odd word aligned.  */
	add	rMEMP, rMEMP, rALIGN	/* rMEMP = next doubleword boundary.  */
	sub	rLEN, rLEN, rALIGN
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
	bt	29, L(g4)
/* Process the even word of doubleword.  */
	bf+	31, L(g2)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(g4x)
L(g2):
	sth	rCHR, -6(rMEMP)
L(g4x):
	stw	rCHR, -4(rMEMP)
	b	L(aligned)
/* Process the odd word of doubleword.  */
L(g4):
	bf	28, L(g4x) /* If false, word aligned on odd word.  */
	bf+	31, L(g0)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)
L(g0):
	sth	rCHR, -2(rMEMP)

/* Handle the case of size <= 31 (cr5, set from the original length
   above): such requests are finished by L(medium).  */
L(aligned2):
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
L(aligned):
	mtcrf	0x01, rLEN		/* cr7 = low 4 bits of rLEN, for L(medium).  */
	ble	cr5, L(medium)
/* Align to 32-byte boundary.  rALIGN becomes the distance (8, 16 or 24)
   to the next 32-byte boundary; the stores walk backwards from that
   boundary through rMEMP2.  */
	andi.	rALIGN, rMEMP, 0x18
	subfic	rALIGN, rALIGN, 0x20
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
	beq	L(caligned)
	mtcrf	0x01, rALIGN
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP
	bf	28, L(a1)		/* Bit 28 = (rALIGN & 8): one odd doubleword.  */
	stdu	rCHR, -8(rMEMP2)
L(a1):	blt	cr1, L(a2)		/* rALIGN < 16: nothing more to store.  */
	std	rCHR, -8(rMEMP2)
	stdu	rCHR, -16(rMEMP2)
L(a2):

/* Now aligned to a 32 byte boundary.  cr1 tells whether the replicated
   fill value is zero; cr0 (from clrrdi.) tells whether fewer than 32
   bytes remain; cr7 is reloaded with the low 4 bits of rLEN.  */
	.align 4
L(caligned):
	cmpldi	cr1, rCHR, 0
	clrrdi.	rALIGN, rLEN, 5
	mtcrf	0x01, rLEN
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
	beq	L(medium)	/* We may not actually get to do a full line.  */
	.align 4
/* Storing a non-zero "c" value.  We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary.  */
L(nzloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary use
   cacheAligned1 code to finish the tail.  */
	cmpldi	cr1,rLEN,128

	andi.	rTMP,rMEMP,127
	blt	cr1,L(cacheAligned1)
	addi	rMEMP3,rMEMP,32
	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP3,127
	std	rCHR,-8(rMEMP3)

	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP3)
	addi	rMEMP,rMEMP,32
	std	rCHR,8(rMEMP3)
	andi.	rTMP,rMEMP,127
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)

	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,32(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,128
	std	rCHR,40(rMEMP3)
/* NOTE(review): cr6 and rMEMP2 set by the next two instructions are not
   read on any path reachable from here in this routine (they mirror the
   zero-fill path below); left in place to keep the schedule unchanged.  */
	cmpldi	cr6,rLEN,256
	li	rMEMP2,128
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)
	blt	cr1,L(cacheAligned1)
	b	L(nzCacheAligned128)

/* Now we are aligned to the cache line and can use dcbtst.  */
	.align 4
L(nzCacheAligned):
	cmpldi	cr1,rLEN,128
	blt	cr1,L(cacheAligned1)
	b	L(nzCacheAligned128)
	.align 5
/* Store one full 128-byte line per iteration as sixteen doublewords.
   cr1 compares the length *before* the decrement with 256, so the loop
   repeats while at least 128 bytes will remain afterwards.  */
L(nzCacheAligned128):
	cmpldi	cr1,rLEN,256
	addi	rMEMP3,rMEMP,64
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	std	rCHR,24(rMEMP)
	std	rCHR,32(rMEMP)
	std	rCHR,40(rMEMP)
	std	rCHR,48(rMEMP)
	std	rCHR,56(rMEMP)
	addi	rMEMP,rMEMP3,64
	addi	rLEN,rLEN,-128
	std	rCHR,0(rMEMP3)
	std	rCHR,8(rMEMP3)
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)
	std	rCHR,32(rMEMP3)
	std	rCHR,40(rMEMP3)
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)
	bge	cr1,L(nzCacheAligned128)
	dcbtst	0,rMEMP		/* Touch-for-store hint on the line holding the tail.  */
	b	L(cacheAligned1)
	.align 5
/* Storing a zero "c" value.  We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary.  If the
   remaining length spans a full cache line we can use the Data cache
   block zero instruction.  */
L(zloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary use
   cacheAligned1 code to finish the tail.  The beq below still tests cr0
   from the clrrdi. at L(caligned): fewer than 32 bytes remain.  */
	cmpldi	cr1,rLEN,128
	beq	L(medium)
L(getCacheAligned):
	andi.	rTMP,rMEMP,127
	nop
	blt	cr1,L(cacheAligned1)
	addi	rMEMP3,rMEMP,32
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP3,127
	std	rCHR,-8(rMEMP3)
L(getCacheAligned2):
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP3)
	std	rCHR,8(rMEMP3)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP,127
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)
L(getCacheAligned3):
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,32(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,128
	std	rCHR,40(rMEMP3)
	cmpldi	cr6,rLEN,256
	li	rMEMP2,128
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)
	blt	cr1,L(cacheAligned1)
	blt	cr6,L(cacheAligned128)
	b	L(cacheAlignedx)

/* Now we are aligned to the cache line and can use dcbz.
   Dispatch on the remaining length:
     < 128        -> L(cacheAligned1)   (32-byte chunks, then tail)
     128 .. 255   -> L(cacheAligned128) (one dcbz)
     256 .. 640   -> two dcbz here, then 128/256 handling
     > 640        -> L(cacheAligned512) (simple one-line-per-pass loop)
   rMEMP2 holds 128 so "dcbz rMEMP2,rMEMP" zeroes the following line.  */
	.align 5
L(cacheAligned):
	cmpldi	cr1,rLEN,128
	cmpldi	cr6,rLEN,256
	blt	cr1,L(cacheAligned1)
	li	rMEMP2,128
L(cacheAlignedx):
	cmpldi	cr5,rLEN,640
	blt	cr6,L(cacheAligned128)
	bgt	cr5,L(cacheAligned512)
	cmpldi	cr6,rLEN,512
	dcbz	0,rMEMP
	cmpldi	cr1,rLEN,384
	dcbz	rMEMP2,rMEMP
	addi	rMEMP,rMEMP,256
	addi	rLEN,rLEN,-256
	blt	cr1,L(cacheAligned1)	/* Was < 384: under 128 bytes remain.  */
	blt	cr6,L(cacheAligned128)	/* Was < 512: under 256 bytes remain.  */
	b	L(cacheAligned256)
	.align 5
/* A simple loop for the longer (>640 bytes) lengths.  This form limits
   the number of mispredicted branches to exactly 1, at loop exit.  */
L(cacheAligned512):
	cmpldi	cr1,rLEN,128
	blt	cr1,L(cacheAligned1)
	dcbz	0,rMEMP
	addi	rLEN,rLEN,-128
	addi	rMEMP,rMEMP,128
	b	L(cacheAligned512)
	.align 5
/* Zero two cache lines per pass.  Both compares use the length before
   the decrement: cr6 (>= 512) means at least 256 bytes remain after this
   pass; cr1 (< 384) means fewer than 128 bytes remain.  */
L(cacheAligned256):

	cmpldi	cr6,rLEN,512

	dcbz	0,rMEMP
	cmpldi	cr1,rLEN,384
	dcbz	rMEMP2,rMEMP
	addi	rMEMP,rMEMP,256
	addi	rLEN,rLEN,-256

	bge	cr6,L(cacheAligned256)

	blt	cr1,L(cacheAligned1)
	.align 4
/* Zero exactly one more cache line, then fall into the sub-line tail.  */
L(cacheAligned128):
	dcbz	0,rMEMP
	addi	rMEMP,rMEMP,128
	addi	rLEN,rLEN,-128
	nop
/* Fewer than 128 bytes remain: store up to three 32-byte chunks with
   doubleword stores.  Shared by the zero and non-zero fill paths.  */
L(cacheAligned1):
	cmpldi	cr1,rLEN,32
	blt	cr1,L(handletail32)
	addi	rMEMP3,rMEMP,32
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,32
	std	rCHR,-8(rMEMP3)
L(cacheAligned2):
	blt	cr1,L(handletail32)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP3)
	std	rCHR,8(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,32
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)
	nop
L(cacheAligned3):
	blt	cr1,L(handletail32)
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	std	rCHR,32(rMEMP3)
	std	rCHR,40(rMEMP3)
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)

/* We are here because the length or remainder (rLEN) is less than the
   cache line/sector size and does not justify aggressive loop unrolling.
   So set up the preconditions for L(medium) and go there.  Return
   immediately if nothing is left.  */
	.align 3
L(handletail32):
	cmpldi	cr1,rLEN,0
	beqlr	cr1
	b	L(medium)

	.align 5
L(small):
/* Memset of 8 bytes or less, one byte at a time.  For lengths 5..8 the
   first four bytes are stored unconditionally and the remainder (1..4)
   falls into L(le4); cr5/cr1 compare the remainder with 1 and 3.  */
	cmpldi	cr6, rLEN, 4
	cmpldi	cr5, rLEN, 1
	ble	cr6,L(le4)
	subi	rLEN, rLEN, 4
	stb	rCHR,0(rMEMP)
	stb	rCHR,1(rMEMP)
	stb	rCHR,2(rMEMP)
	stb	rCHR,3(rMEMP)
	addi	rMEMP,rMEMP, 4
	cmpldi	cr5, rLEN, 1
L(le4):
	cmpldi	cr1, rLEN, 3
	bltlr	cr5			/* 0 bytes left.  */
	stb	rCHR, 0(rMEMP)
	beqlr	cr5			/* 1 byte.  */
	stb	rCHR, 1(rMEMP)
	bltlr	cr1			/* 2 bytes.  */
	stb	rCHR, 2(rMEMP)
	beqlr	cr1			/* 3 bytes.  */
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.
   On entry cr7 holds the low four bits of rLEN (loaded by the mtcrf at
   L(aligned) or L(caligned)) and rCHR holds at least a replicated word.
   rMEMP is advanced to the end of the region and the stores walk
   backwards: 1 byte if bit 31, 2 bytes if bit 30, 4 bytes if bit 29,
   16 bytes if rLEN >= 16 (cr1), and 8 bytes if bit 28.  */
	.align 5
L(medium):
	insrdi	rCHR, rCHR, 32, 0 /* Replicate word to double word.  */
	cmpldi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt	29, L(medium_29t)
L(medium_29f):
	bge	cr1, L(medium_27t)
	bflr	28
	std	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt	cr1, L(medium_27f)
L(medium_27t):
	std	rCHR, -8(rMEMP)
	stdu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr	28
L(medium_28t):
	std	rCHR, -8(rMEMP)
	blr
END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)