/* Optimized memset implementation for PowerPC64/POWER8.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */

#ifndef MEMSET
# define MEMSET memset
#endif
	.machine power8
ENTRY_TOCLESS (MEMSET, 5)
	CALL_MCOUNT 3

L(_memset):
	cmpldi	cr7,r5,31
	neg	r0,r3
	mr	r10,r3

	insrdi	r4,r4,8,48
	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
	ble	cr7,L(write_LT_32)

	andi.	r11,r10,15	/* Check alignment of DST.  */
	insrdi	r4,r4,32,0	/* Replicate word to double word.  */

	beq	L(big_aligned)

	mtocrf	0x01,r0
	clrldi	r0,r0,60

	/* Get DST aligned to 16 bytes.  */
1:	bf	31,2f
	stb	r4,0(r10)
	addi	r10,r10,1

2:	bf	30,4f
	sth	r4,0(r10)
	addi	r10,r10,2

4:	bf	29,8f
	stw	r4,0(r10)
	addi	r10,r10,4

8:	bf	28,16f
	std	r4,0(r10)
	addi	r10,r10,8

16:	subf	r5,r0,r5

	.align	4
L(big_aligned):
	/* For sizes larger than 255 there are two possible paths:
	   - if the constant is '0', zero full cache lines with dcbz;
	   - otherwise use vector instructions.  */
	cmpldi	cr5,r5,255
	dcbtst	0,r10
	cmpldi	cr6,r4,0
	crand	27,26,21
	bt	27,L(huge_dcbz)
	bge	cr5,L(huge_vector)


	/* Size between 32 and 255 bytes with a constant other than 0: use
	   doubleword store instructions to achieve the best throughput.  */
	srdi	r8,r5,5
	clrldi	r11,r5,59
	cmpldi	cr6,r11,0
	cmpdi	r8,0
	beq	L(tail_bytes)
	mtctr	r8

	/* Main aligned write loop, writes 32 bytes at a time.  */
	.align	4
L(big_loop):
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32
	bdz	L(tail_bytes)

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32
	bdnz	L(big_loop)

	b	L(tail_bytes)

	/* Write remaining 1~31 bytes.  */
	.align	4
L(tail_bytes):
	beqlr	cr6

	srdi	r7,r11,4
	clrldi	r8,r11,60
	mtocrf	0x01,r7

	.align	4
	bf	31,8f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
8:	mtocrf	0x1,r8
	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Size larger than 255 bytes with a constant other than 0: use
	   vector instructions to achieve the best throughput.  */
L(huge_vector):
	/* Replicate the set byte to a quadword in a VMX register.  */
	mtvsrd	v1,r4
	xxpermdi 32,v0,v1,0
	vspltb	v2,v0,15

	/* Main aligned write loop: 128 bytes at a time.  */
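	/* Note: r6/r7/r8 below hold the 16/32/48-byte offsets consumed by
	   the indexed stvx stores, and r12 = r5 >> 7 counts the full
	   128-byte iterations.  mtocrf 0x02,r5 loads CR field 6 so that the
	   64-, 32- and 16-byte remainder bits can be tested at
	   L(aligned_tail).  */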
	li	r6,16
	li	r7,32
	li	r8,48
	mtocrf	0x02,r5
	srdi	r12,r5,7
	cmpdi	r12,0
	beq	L(aligned_tail)
	mtctr	r12
	b	L(aligned_128loop)

	.align	4
L(aligned_128loop):
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	bdnz	L(aligned_128loop)

	/* Write remaining 1~127 bytes.  */
L(aligned_tail):
	mtocrf	0x01,r5
	bf	25,32f
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64

32:	bf	26,16f
	stvx	v2,0,r10
	stvx	v2,r10,r6
	addi	r10,r10,32

16:	bf	27,8f
	stvx	v2,0,r10
	addi	r10,r10,16

8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	/* Copies 4~7 bytes.  */
4:	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	/* Return original DST pointer.  */
	blr

	/* Special case when the value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out a full cache line of 128 bytes at a
	   time.  Before using dcbz though, we need to get the destination
	   128-byte aligned.  */
	.align	4
L(huge_dcbz):
	andi.	r11,r10,127
	neg	r0,r10
	beq	L(huge_dcbz_aligned)

	clrldi	r0,r0,57
	subf	r5,r0,r5
	srdi	r0,r0,3
	mtocrf	0x01,r0

	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
8:	bf	28,4f

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	29,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	30,1f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
1:	bf	31,L(huge_dcbz_aligned)
	std	r4,0(r10)
	addi	r10,r10,8

L(huge_dcbz_aligned):
	/* Set up the dcbz unroll offsets and loop counts.  */
	srdi	r8,r5,9
	clrldi	r11,r5,55
	cmpldi	cr6,r11,0
	li	r9,128
	cmpdi	r8,0
	beq	L(huge_tail)
	li	r7,256
	li	r6,384
	mtctr	r8

	.align	4
L(huge_loop):
	/* Sets 512 bytes to zero in each iteration; the loop unrolling shows
	   a throughput boost for large sizes (2048 bytes or higher).  */
	dcbz	0,r10
	dcbz	r9,r10
	dcbz	r7,r10
	dcbz	r6,r10
	addi	r10,r10,512
	bdnz	L(huge_loop)

	beqlr	cr6

L(huge_tail):
	srdi	r6,r11,8
	srdi	r7,r11,4
	clrldi	r8,r11,4
	cmpldi	cr6,r8,0
	mtocrf	0x01,r6

	beq	cr6,L(tail)

	/* We have 1~511 bytes remaining.  */
	.align	4
32:	bf	31,16f
	dcbz	0,r10
	dcbz	r9,r10
	addi	r10,r10,256

	.align	4
16:	mtocrf	0x01,r7
	bf	28,8f
	dcbz	0,r10
	addi	r10,r10,128

	.align	4
8:	bf	29,4f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	30,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	31,L(tail)
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16
	.align	4

	/* Remaining 1~15 bytes.  */
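	/* CR field 7 is reloaded below from r8, whose low 4 bits hold the
	   remaining length, so each bf/bflr tests one bit: 8, 4, 2 and
	   finally a single trailing byte.  */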
L(tail):
	mtocrf	0x01,r8

	.align	4
8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
	   by just unrolling all operations.  */
	.align	4
L(write_LT_32):
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(write_LE_8)

	/* At least 9 bytes to go.  */
	neg	r8,r4
	andi.	r0,r8,3
	cmpldi	cr1,r5,16
	beq	L(write_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,r0
	subf	r5,r0,r5

2:	bf	30,1f
	/* Use stb instead of sth because it doesn't generate
	   alignment interrupts on cache-inhibited storage.  */
	stb	r4,0(r10)
	stb	r4,1(r10)
	addi	r10,r10,2

1:	bf	31,L(end_4bytes_alignment)
	stb	r4,0(r10)
	addi	r10,r10,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(write_LT_32_aligned):
	blt	cr1,8f

	stw	r4,0(r10)
	stw	r4,4(r10)
	stw	r4,8(r10)
	stw	r4,12(r10)
	addi	r10,r10,16

8:	bf	28,L(tail4)
	stw	r4,0(r10)
	stw	r4,4(r10)
	addi	r10,r10,8

	.align	4
	/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	sth	r4,0(r10)
	bflr	31
	stb	r4,2(r10)
	blr

	.align	4
L(tail5):
	bflr	31
	stb	r4,4(r10)
	blr

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handle copies of 0~8 bytes.  */
	.align	4
L(write_LE_8):
	bne	cr6,L(LE7_tail4)
	/* If the destination is word aligned, use stw, else use stb.  */
	andi.	r0,r10,3
	bne	L(8_unalign)

	stw	r4,0(r10)
	stw	r4,4(r10)
	blr

	/* Unaligned destination and size is 8.  */
	.align	4
L(8_unalign):
	andi.	r0,r10,1
	beq	L(8_hwalign)
	stb	r4,0(r10)
	sth	r4,1(r10)
	sth	r4,3(r10)
	sth	r4,5(r10)
	stb	r4,7(r10)
	blr

	/* Halfword aligned destination and size is 8.  */
	.align	4
L(8_hwalign):
	sth	r4,0(r10)
	sth	r4,2(r10)
	sth	r4,4(r10)
	sth	r4,6(r10)
	blr

	.align	4
	/* Copies 4~7 bytes.  */
L(LE7_tail4):
	/* Use stb instead of sth because it doesn't generate
	   alignment interrupts on cache-inhibited storage.  */
	bf	29,L(LE7_tail2)
	stb	r4,0(r10)
	stb	r4,1(r10)
	stb	r4,2(r10)
	stb	r4,3(r10)
	bf	30,L(LE7_tail5)
	stb	r4,4(r10)
	stb	r4,5(r10)
	bflr	31
	stb	r4,6(r10)
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(LE7_tail2):
	bf	30,1f
	stb	r4,0(r10)
	stb	r4,1(r10)
	bflr	31
	stb	r4,2(r10)
	blr

	.align	4
L(LE7_tail5):
	bflr	31
	stb	r4,4(r10)
	blr

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)