/* Optimized memset implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   Register usage throughout this file:
     r3   's'.  Never written, so it still holds the return value at
	  every blr.
     r4   Fill pattern: the low byte of 'c' replicated to 32 bits on
	  every path, and to 64 bits on the paths for 32 or more bytes.
     r5   Number of bytes still to be stored.
     r10  Running destination pointer.
     r12  Copy of the original length on the 32+ byte paths (read again
	  after the dcbz loop); on the non-dcbz path it is later reused
	  as the second store pointer and as the tail pointer.
     r0, r8, r9, r11  Scratch.

   'mtocrf 0x01,rX' copies the low four bits of rX into CR7, so the
   'bf 28', 'bf 29', 'bf 30' and 'bf 31' tests below branch when the
   8s, 4s, 2s or 1s bit of rX is clear, respectively.

   Path selection by length:
     n <= 8        L(small)
     9 <= n <= 31  L(medium)
     n >= 32       L(big_aligned), or L(huge) when additionally the
		   fill value is zero and more than 255 bytes remain
		   after the destination has been 8-byte aligned.  */

#ifndef MEMSET
# define MEMSET memset
#endif
	.machine power7
ENTRY_TOCLESS (MEMSET, 5)
	CALL_MCOUNT 3

L(_memset):
	cmpldi	cr7,5,31	/* cr7: length vs. 31 (medium/big split).  */
	cmpldi	cr6,5,8		/* cr6: length vs. 8 (small split).  */
	mr	10,3		/* Work on a copy; r3 stays the return value.  */

	/* Replicate byte to word.  */
	insrdi	4,4,8,48
	insrdi	4,4,16,32
	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */

	neg	0,3		/* Low bits of -s = bytes up to next boundary.  */
	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */

	andi.	11,10,7		/* Check alignment of DST.  */
	insrdi	4,4,32,0	/* Replicate word to double word.  */

	mr	12,5		/* Keep the original length for L(huge).  */
	beq	L(big_aligned)

	clrldi	0,0,61		/* r0 = 1..7 bytes needed to reach alignment.  */
	mtocrf	0x01,0
	subf	5,0,5

	/* Get DST aligned to 8 bytes.  */
1:	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	bf	30,4f

	sth	4,0(10)
	addi	10,10,2
4:	bf	29,L(big_aligned)

	stw	4,0(10)
	addi	10,10,4

	.align	4
L(big_aligned):

	cmpldi	cr5,5,255
	/* NOTE(review): r0 appears to be overwritten on every path before
	   it is read again (the 0 in 'dcbtst 0,10' is the RA=0 encoding,
	   not r0), so this 'li' looks dead -- confirm before removing.  */
	li	0,32
	dcbtst	0,10
	cmpldi	cr6,4,0
	srdi	9,5,3	/* Number of full doublewords remaining.  */
	/* CR bit 27 = CR6[eq] AND CR5[gt], i.e. value == 0 && length > 255.
	   mtocrf 0x01 only rewrites CR7, so bit 27 survives it.  */
	crand	27,26,21
	mtocrf	0x01,9
	bt	27,L(huge)

	/* From this point on either the value is nonzero or no more than
	   255 bytes remain, so dcbz is not used.  At least 25 bytes
	   remain: the length was 32+ and at most 7 bytes went to
	   alignment.  */

	srdi	8,5,5		/* Number of 32-byte groups for the loop.  */
	clrldi	11,5,61
	cmpldi	cr6,11,0	/* cr6[eq]: no tail bytes after doublewords.  */
	cmpldi	cr1,9,4		/* cr1[lt]: fewer than 4 doublewords in all.  */
	mtctr	8

	/* Copy 1~3 doublewords so that the doubleword count left for the
	   main loop is a multiple of 4 (32 bytes).  */

	bf	30,1f

	std	4,0(10)
	std	4,8(10)
	addi	10,10,16
	bf	31,L(big_loop)

	std	4,0(10)
	addi	10,10,8
	mr	12,10		/* L(tail_bytes) stores through r12.  */
	blt	cr1,L(tail_bytes)
	b	L(big_loop)

	.align	4
1:	/* Copy 1 doubleword.  */
	bf	31,L(big_loop)

	std	4,0(10)
	addi	10,10,8

	/* Main aligned copy loop.  Copies 32-bytes at a time and
	   ping-pong through r10 and r12 to avoid AGEN delays.  */
	.align	4
L(big_loop):
	addi	12,10,32
	std	4,0(10)
	std	4,8(10)
	std	4,16(10)
	std	4,24(10)
	bdz	L(tail_bytes)	/* r12 already points past the stored data.  */

	addi	10,10,64
	std	4,0(12)
	std	4,8(12)
	std	4,16(12)
	std	4,24(12)
	bdnz	L(big_loop)

	mr	12,10
	b	L(tail_bytes)

	.align	4
L(tail_bytes):

	/* Check for tail bytes.  */
	beqlr	cr6

	clrldi	0,5,61
	mtocrf	0x01,0

	/* At this point we have a tail of 1-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	stw	4,0(12)
	addi	12,12,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	sth	4,0(12)
	addi	12,12,2
1:	/* Copy 1 byte.  */
	bflr	31

	stb	4,0(12)
	blr

	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
	   dcbz though, we need to get the destination 128-bytes aligned.  */
	.align	4
L(huge):
	andi.	11,10,127
	neg	0,10
	beq	L(huge_aligned)

	clrldi	0,0,57		/* r0 = bytes up to the next 128-byte boundary.  */
	subf	5,0,5
	srdi	0,0,3		/* DST is 8-byte aligned: count doublewords.  */
	mtocrf	0x01,0

	/* Get DST aligned to 128 bytes.  */
8:	bf	28,4f

	std	4,0(10)
	std	4,8(10)
	std	4,16(10)
	std	4,24(10)
	std	4,32(10)
	std	4,40(10)
	std	4,48(10)
	std	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	std	4,0(10)
	std	4,8(10)
	std	4,16(10)
	std	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	std	4,0(10)
	std	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(huge_aligned)

	std	4,0(10)
	addi	10,10,8


L(huge_aligned):
	srdi	8,5,7		/* Number of 128-byte blocks.  */
	clrldi	11,5,57
	cmpldi	cr6,11,0	/* cr6[eq]: nothing left after the dcbz loop.  */
	mtctr	8

	.align	4
L(huge_loop):
	dcbz	0,10
	addi	10,10,128
	bdnz	L(huge_loop)

	/* Check how many bytes are still left.  */
	beqlr	cr6

	subf	9,3,10		/* r9 = bytes written so far (r10 - s).  */
	subf	5,9,12		/* r5 = original length - bytes written.  */
	srdi	8,5,3
	cmpldi	cr6,8,0
	mtocrf	0x01,8

	/* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
	   speed.  We'll handle the resulting tail bytes later.  */
	beq	cr6,L(tail)

8:	bf	28,4f

	std	4,0(10)
	std	4,8(10)
	std	4,16(10)
	std	4,24(10)
	std	4,32(10)
	std	4,40(10)
	std	4,48(10)
	std	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	std	4,0(10)
	std	4,8(10)
	std	4,16(10)
	std	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	std	4,0(10)
	std	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(tail)

	std	4,0(10)
	addi	10,10,8

	/* Handle the rest of the tail bytes here.  */
L(tail):
	mtocrf	0x01,5

	.align	4
4:	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
	.align	4
2:	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
	.align	4
1:	bflr	31

	stb	4,0(10)
	blr

	/* Expanded tree to copy tail bytes without increments.  Entered
	   from L(small) with the length's low bits already in CR7; each
	   leaf uses a fixed offset from r10 instead of advancing it.  */
	.align	4
L(copy_tail):
	bf	29,L(FXX)

	stw	4,0(10)
	bf	30,L(TFX)

	sth	4,4(10)
	bflr	31

	stb	4,6(10)
	blr

	.align	4
L(FXX):	bf	30,L(FFX)

	sth	4,0(10)
	bflr	31

	stb	4,2(10)
	blr

	.align	4
L(TFX):	bflr	31

	stb	4,4(10)
	blr

	.align	4
L(FFX):	bflr	31

	stb	4,0(10)
	blr

	/* Handle copies of 9~31 bytes.  */
	.align	4
L(medium):
	/* At least 9 bytes to go.  */
	andi.	11,10,3
	clrldi	0,0,62		/* r0 = bytes up to the next 4-byte boundary.  */
	beq	L(medium_aligned)

	/* Force 4-bytes alignment for DST.  */
	mtocrf	0x01,0
	subf	5,0,5
1:	/* Copy 1 byte.  */
	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	/* Copy 2 bytes.  */
	bf	30,L(medium_aligned)

	sth	4,0(10)
	addi	10,10,2

	.align	4
L(medium_aligned):
	/* At least 6 bytes to go, and DST is word-aligned.  */
	cmpldi	cr1,5,16
	mtocrf	0x01,5
	blt	cr1,8f

	/* Copy 16 bytes.  */
	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	addi	10,10,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
1:	/* Copy 1 byte.  */
	bflr	31

	stb	4,0(10)
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(small):
	mtocrf	0x01,5
	bne	cr6,L(copy_tail)	/* cr6 still holds length vs. 8.  */

	/* Exactly 8 bytes.  */
	stw	4,0(10)
	stw	4,4(10)
	blr

END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)