/* Optimized memset implementation for PowerPC32/POWER7.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */

	.machine  power7
EALIGN (memset, 5, 0)
	CALL_MCOUNT

	.align	4
L(_memset):
	cmplwi	cr7,5,31
	cmplwi	cr6,5,8
	mr	10,3		/* Save original argument for later.  */
	mr	7,1		/* Save original r1 for later.  */
	cfi_offset(31,-8)

	/* Replicate byte to word.  */
	insrwi	4,4,8,16
	insrwi	4,4,16,0

	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */

	neg	0,3
	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */

	/* Save our word twice to create a doubleword that we will later
	   copy to a FPR.  */
	stwu	1,-32(1)
	andi.	11,10,7		/* Check alignment of DST.  */
	mr	12,5
	stw	4,24(1)
	stw	4,28(1)
	beq	L(big_aligned)

	clrlwi	0,0,29
	mtocrf	0x01,0
	subf	5,0,5

	/* Get DST aligned to 8 bytes.  */
1:	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	bf	30,4f

	sth	4,0(10)
	addi	10,10,2
4:	bf	29,L(big_aligned)

	stw	4,0(10)
	addi	10,10,4

	.align	4
L(big_aligned):
	cmplwi	cr5,5,255
	li	0,32
	cmplwi	cr1,5,160
	dcbtst	0,10
	cmplwi	cr6,4,0
	srwi	9,5,3		/* Number of full doublewords remaining.  */
	crand	27,26,21
	mtocrf	0x01,9
	bt	27,L(huge)

	/* From this point on, we'll copy 32+ bytes and the value
	   isn't 0 (so we can't use dcbz).  */

	srwi	8,5,5
	clrlwi	11,5,29
	cmplwi	cr6,11,0
	cmplwi	cr1,9,4
	mtctr	8

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */

	bf	30,1f

	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	addi	10,10,16
	bf	31,L(big_loop)

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
	mr	12,10
	blt	cr1,L(tail_bytes)

	b	L(big_loop)

	.align	4
1:	/* Copy 1 doubleword.  */
	bf	31,L(big_loop)

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8

	/* First use a 32-bytes loop with stw's to try and avoid the LHS due
	   to the lfd we will do next.  Also, ping-pong through r10 and r12
	   to avoid AGEN delays.  */
	.align	4
L(big_loop):
	addi	12,10,32
	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	stw	4,16(10)
	stw	4,20(10)
	stw	4,24(10)
	stw	4,28(10)
	bdz	L(tail_bytes)

	addi	10,10,64
	stw	4,0(12)
	stw	4,4(12)
	stw	4,8(12)
	stw	4,12(12)
	stw	4,16(12)
	stw	4,20(12)
	stw	4,24(12)
	stw	4,28(12)
	bdnz	L(big_loop_fast_setup)

	mr	12,10
	b	L(tail_bytes)

	/* Now that we're probably past the LHS window, use the VSX to
	   speed up the loop.  */
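	/* The lxvdsx below loads the doubleword at r1+24 (the fill word
	   was stored twice there above) and splats it into both halves of
	   VSR 4, so each stxvd2x in the fast loop stores 16 bytes of the
	   pattern.  */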
L(big_loop_fast_setup):
	li	11,24
	li	6,16
	lxvdsx	4,1,11

	.align	4
L(big_loop_fast):
	addi	12,10,32
	stxvd2x	4,0,10
	stxvd2x	4,10,6
	bdz	L(tail_bytes)

	addi	10,10,64
	stxvd2x	4,0,12
	stxvd2x	4,12,6
	bdnz	L(big_loop_fast)

	mr	12,10

	.align	4
L(tail_bytes):

	/* Check for tail bytes.  */
	mr	1,7		/* Restore r1.  */
	beqlr	cr6

	clrlwi	0,5,29
	mtocrf	0x01,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	stw	4,0(12)
	addi	12,12,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	sth	4,0(12)
	addi	12,12,2
1:	/* Copy 1 byte.  */
	bflr	31

	stb	4,0(12)
	blr


	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
	   dcbz though, we need to get the destination 128-bytes aligned.  */
	.align	4
L(huge):
	lfd	4,24(1)
	andi.	11,10,127
	neg	0,10
	beq	L(huge_aligned)

	clrlwi	0,0,25
	subf	5,0,5
	srwi	0,0,3
	mtocrf	0x01,0

	/* Get DST aligned to 128 bytes.  */
8:	bf	28,4f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	stfd	4,32(10)
	stfd	4,40(10)
	stfd	4,48(10)
	stfd	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	stfd	4,0(10)
	stfd	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(huge_aligned)

	stfd	4,0(10)
	addi	10,10,8

L(huge_aligned):
	srwi	8,5,7
	clrlwi	11,5,25
	cmplwi	cr6,11,0
	mtctr	8

	/* Copies 128-bytes at a time.  */
	.align	4
L(huge_loop):
	dcbz	0,10
	addi	10,10,128
	bdnz	L(huge_loop)

	/* We have a tail of 0~127 bytes to handle.  */
	mr	1,7		/* Restore r1.  */
	beqlr	cr6

	subf	9,3,10
	subf	5,9,12
	srwi	8,5,3
	cmplwi	cr6,8,0
	mtocrf	0x01,8

	/* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
	   speed.  We'll handle the resulting tail bytes later.  */
	beq	cr6,L(tail)

8:	bf	28,4f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	stfd	4,32(10)
	stfd	4,40(10)
	stfd	4,48(10)
	stfd	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	stfd	4,0(10)
	stfd	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(tail)

	stfd	4,0(10)
	addi	10,10,8

	/* Handle the rest of the tail bytes here.  */
L(tail):
	mtocrf	0x01,5

	.align	4
4:	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
	.align	4
2:	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
	.align	4
1:	bflr	31

	stb	4,0(10)
	blr


	/* Expanded tree to copy tail bytes without increments.  */
	.align	4
L(copy_tail):
	bf	29,L(FXX)

	stw	4,0(10)
	bf	30,L(TFX)

	sth	4,4(10)
	bflr	31

	stb	4,6(10)
	blr

	.align	4
L(FXX):	bf	30,L(FFX)

	sth	4,0(10)
	bflr	31

	stb	4,2(10)
	blr

	.align	4
L(TFX):	bflr	31

	stb	4,4(10)
	blr

	.align	4
L(FFX):	bflr	31

	stb	4,0(10)
	blr

	/* Handle copies of 9~31 bytes.  */
	.align	4
L(medium):
	/* At least 9 bytes to go.  */
	andi.	11,10,3
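	/* r11 = DST & 3 (also sets cr0 for the beq below); r0 still holds
	   -DST from above, so the clrlwi that follows keeps its low two
	   bits, i.e. the byte count needed to reach 4-byte alignment.  */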
	clrlwi	0,0,30
	beq	L(medium_aligned)

	/* Force 4-bytes alignment for DST.  */
	mtocrf	0x01,0
	subf	5,0,5
1:	/* Copy 1 byte.  */
	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	/* Copy 2 bytes.  */
	bf	30,L(medium_aligned)

	sth	4,0(10)
	addi	10,10,2

	.align	4
L(medium_aligned):
	/* At least 6 bytes to go, and DST is word-aligned.  */
	cmplwi	cr1,5,16
	mtocrf	0x01,5
	blt	cr1,8f

	/* Copy 16 bytes.  */
	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	addi	10,10,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
1:	/* Copy 1 byte.  */
	bflr	31

	stb	4,0(10)
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(small):
	mtocrf	0x01,5
	bne	cr6,L(copy_tail)

	stw	4,0(10)
	stw	4,4(10)
	blr

END (memset)
libc_hidden_builtin_def (memset)