/* Set a block of memory to some byte value.  For SUN4V M7.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef XCC
# define XCC xcc
#endif
	.register	%g2, #scratch
	.register	%g3, #scratch

/* The algorithm is as follows :
 *
 * For small counts (7 or fewer bytes), store bytes individually.
 *
 * For less than 32 bytes stores, align the address on 4 byte boundary.
 * Then store as many 4-byte chunks, followed by trailing bytes.
 *
 * For sizes greater than 32 bytes, align the address on 8 byte boundary.
 * if (count >= 64) {
 *	store 8-bytes chunks to align the address on 64 byte boundary
 *	if (value to be set is zero && count >= MIN_ZERO) {
 *		Using BIS stores, set the first long word of each
 *		64-byte cache line to zero which will also clear the
 *		other seven long words of the cache line.
 *	}
 *	else if (count >= MIN_LOOP) {
 *		Using BIS stores, set the first long word of each of
 *		ST_CHUNK cache lines (64 bytes each) before the main
 *		loop is entered.
 *		In the main loop, continue pre-setting the first long
 *		word of each cache line ST_CHUNK lines in advance while
 *		setting the other seven long words (56 bytes) of each
 *		cache line until fewer than ST_CHUNK*64 bytes remain.
 *		Then set the remaining seven long words of each cache
 *		line that has already had its first long word set.
 *	}
 *	store remaining data in 64-byte chunks until less than
 *	64 bytes remain.
 * }
 * Store as many 8-byte chunks, followed by trailing bytes.
 *
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline.  That avoids various pipeline delays,
 *   such as filling the miss buffer.  The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS for memset of greater than MIN_LOOP bytes because a sequence
 * BIS stores must be followed by a membar #StoreStore.  The benefit of
 * the BIS store must be balanced against the cost of the membar operation.
 */

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */

#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define ASI_ST_BLK_INIT_MRU_P	0xf2

#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define ST_CHUNK	24   /* multiple of 4 due to loop unrolling */
#define MIN_LOOP	(ST_CHUNK)*64
#define MIN_ZERO	256

#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text
	.align		32

/* void *__memset_niagara7 (void *dst, int c, size_t n)

   SPARC V9 ABI; arguments arrive in the out registers.
   Register roles throughout the routine:
     %o0 = dst; never written after entry, so it is the return value.
     %o1 = c, widened below into an 8-byte replicated pattern.
     %o2 = byte count still owed (trailing-byte count after alignment).
     %o3 = scratch: alignment adjust / residual-byte counter.
     %o4 = count of bytes handled by the 64-byte block loops.
     %o5 = current destination pointer (working copy of dst).
     %g1, %g3 = loop scratch (declared #scratch above).
   NOTE: branch delay slots are used throughout — the instruction after
   each branch executes before the branch takes effect.  */
ENTRY(__memset_niagara7)
	/* memset (src, c, size) */
	mov	%o0, %o5		/* copy sp1 before using it */
	cmp	%o2, 7			/* if small counts, just write bytes */
	bleu,pn	%XCC, .Lwrchar
	 and	%o1, 0xff, %o1		/* o1 is (char)c */

	sll	%o1, 8, %o3
	or	%o1, %o3, %o1		/* now o1 has 2 bytes of c */
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%XCC, .Lwdalign
	 or	%o1, %o3, %o1		/* now o1 has 4 bytes of c */

	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1		/* now o1 has 8 bytes of c */

.Ldbalign:
	andcc	%o5, 7, %o3		/* is sp1 aligned on a 8 byte bound? */
	bz,pt	%XCC, .Lblkalign	/* already long word aligned */
	 sub	%o3, 8, %o3		/* -(bytes till long word aligned) */

	add	%o2, %o3, %o2		/* update o2 with new count */
	/* Set -(%o3) bytes till sp1 long word aligned */
1:	stb	%o1, [%o5]		/* there is at least 1 byte to set */
	inccc	%o3			/* byte clearing loop */
	bl,pt	%XCC, 1b
	 inc	%o5

	/* Now sp1 is long word aligned (sp1 is found in %o5) */
.Lblkalign:
	cmp	%o2, 64			/* check if there are 64 bytes to set */
	blu,pn	%XCC, .Lwrshort
	 mov	%o2, %o3

	andcc	%o5, 63, %o3		/* is sp1 block aligned? */
	bz,pt	%XCC, .Lblkwr		/* now block aligned */
	 sub	%o3, 64, %o3		/* o3 is -(bytes till block aligned) */
	add	%o2, %o3, %o2		/* o2 is the remainder */

	/* Store -(%o3) bytes till dst is block (64 byte) aligned.  */
	/* Use long word stores.  */
	/* Recall that dst is already long word aligned */
1:
	addcc	%o3, 8, %o3
	stx	%o1, [%o5]
	bl,pt	%XCC, 1b
	 add	%o5, 8, %o5

	/* Now sp1 is block aligned */
.Lblkwr:
	andn	%o2, 63, %o4		/* calculate size of blocks in bytes */
	brz,pn	%o1, .Lwrzero		/* special case if c == 0 */
	 and	%o2, 63, %o3		/* %o3 = bytes left after blk stores */

	cmp	%o4, MIN_LOOP		/* check for enough bytes to set */
	blu,pn	%XCC, .Lshort_set	/* to justify cost of membar */
	 nop				/* must be > pre-cleared lines */

	/* initial cache-clearing stores */
	/* get store pipeline moving */

/* Primary memset loop for large memsets */
.Lwr_loop:
	mov	ST_CHUNK, %g1
.Lwr_loop_start:
	/* Pre-set the first long word of ST_CHUNK cache lines (4 per
	   iteration) with the MRU block-init ASI; each such store
	   initializes the whole 64-byte line.  */
	subcc	%g1, 4, %g1
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	bgu	%XCC, .Lwr_loop_start
	 add	%o5, 64, %o5

	sub	%o5, ST_CHUNK*64, %o5	/* reset %o5 */
	mov	ST_CHUNK, %g1
	sub	%o5, 8, %o5		/* adjust %o5 for ASI store */

.Lwr_loop_rest:
	/* Fill in the remaining 7 long words of each pre-set line;
	   the final store of the line uses the LRU block-init ASI.  */
	stx	%o1,[%o5+8+8]
	sub	%o4, 64, %o4
	stx	%o1,[%o5+16+8]
	subcc	%g1, 1, %g1
	stx	%o1,[%o5+24+8]
	stx	%o1,[%o5+32+8]
	stx	%o1,[%o5+40+8]
	add	%o5, 64, %o5
	stx	%o1,[%o5-8]
	bgu	%XCC, .Lwr_loop_rest
	 EX_ST(STORE_INIT(%o1,%o5))

	add	%o5, 8, %o5		/* restore %o5 offset */

	/* If more than ST_CHUNK*64 bytes remain to set, continue */
	/* setting the first long word of each cache line in advance */
	/* to keep the store pipeline moving.  */

	cmp	%o4, ST_CHUNK*64
	bge,pt	%XCC, .Lwr_loop_start
	 mov	ST_CHUNK, %g1

	brz,a,pn %o4, .Lasi_done
	 nop

	sub	%o5, 8, %o5		/* adjust %o5 for ASI store */
.Lwr_loop_small:
	/* Fewer than ST_CHUNK lines remain: per line, one MRU block-init
	   store, six plain stx, and a final LRU block-init store.  */
	add	%o5, 8, %o5		/* adjust %o5 for ASI store */
	EX_ST(STORE_ASI(%o1,%o5))
	stx	%o1,[%o5+8]
	stx	%o1,[%o5+16]
	stx	%o1,[%o5+24]
	stx	%o1,[%o5+32]
	subcc	%o4, 64, %o4
	stx	%o1,[%o5+40]
	add	%o5, 56, %o5
	stx	%o1,[%o5-8]
	bgu,pt	%XCC, .Lwr_loop_small
	 EX_ST(STORE_INIT(%o1,%o5))

	ba	.Lasi_done
	 add	%o5, 8, %o5		/* restore %o5 offset */

/* Special case loop for zero fill memsets */
/* For each 64 byte cache line, single STBI to first element */
/* clears line */
.Lwrzero:
	cmp	%o4, MIN_ZERO		/* check if enough bytes to set */
					/* to pay %asi + membar cost */
	blu	%XCC, .Lshort_set
	 nop
	sub	%o4, 256, %o4

.Lwrzero_loop:
	/* Clear 4 cache lines (256 bytes) per iteration with one
	   block-init store per line.  */
	mov	64, %g3
	EX_ST(STORE_INIT(%o1,%o5))
	subcc	%o4, 256, %o4
	EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%o5, 256, %o5
	sub	%g3, 192, %g3		/* %g3 = -128: old %o5 + 128 */
	EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%g3, 64, %g3		/* %g3 = -64: old %o5 + 192 */
	bge,pt	%XCC, .Lwrzero_loop
	 EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%o4, 256, %o4		/* undo loop bias; %o4 = bytes left */

	brz,pn	%o4, .Lbsi_done
	 nop
.Lwrzero_small:
	EX_ST(STORE_INIT(%o1,%o5))
	subcc	%o4, 64, %o4
	bgu,pt	%XCC, .Lwrzero_small
	 add	%o5, 64, %o5

.Lasi_done:
.Lbsi_done:
	membar	#StoreStore		/* required by use of BSI */

.Lshort_set:
	cmp	%o4, 64			/* check if 64 bytes to set */
	blu	%XCC, 5f
	 nop
4:					/* set final blocks of 64 bytes */
	stx	%o1, [%o5]
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+32]
	stx	%o1, [%o5+40]
	add	%o5, 64, %o5
	stx	%o1, [%o5-16]
	bgu,pt	%XCC, 4b
	 stx	%o1, [%o5-8]

5:
	/* Set the remaining long words */
.Lwrshort:
	subcc	%o3, 8, %o3		/* Can we store any long words?  */
	blu,pn	%XCC, .Lwrchars
	 and	%o2, 7, %o2		/* calc bytes left after long words */
6:
	subcc	%o3, 8, %o3
	stx	%o1, [%o5]		/* store the long words */
	bgeu,pt	%XCC, 6b
	 add	%o5, 8, %o5

.Lwrchars:				/* check for extra chars */
	brnz	%o2, .Lwrfin
	 nop
	retl
	 nop

/* Path for counts of 8..31 bytes: word (4-byte) granularity.  */
.Lwdalign:
	andcc	%o5, 3, %o3		/* is sp1 aligned on a word boundary */
	bz,pn	%XCC, .Lwrword
	 andn	%o2, 3, %o3		/* create word sized count in %o3 */

	dec	%o2			/* decrement count */
	stb	%o1, [%o5]		/* clear a byte */
	b	.Lwdalign
	 inc	%o5			/* next byte */

.Lwrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		/* 4-byte writing loop */
	bnz,pt	%XCC, .Lwrword
	 add	%o5, 4, %o5
	and	%o2, 3, %o2		/* leftover count, if any */

.Lwrchar:
	/* Set the remaining bytes, if any */
	brz	%o2, .Lexit
	 nop
.Lwrfin:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%XCC, .Lwrfin
	 inc	%o5
.Lexit:
	retl				/* %o0 was preserved */
	 nop
END(__memset_niagara7)
#endif