/* strcpy with AVX2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)


# ifndef USE_AS_STRCAT
# include <sysdep.h>

# ifndef STRCPY
# define STRCPY __strcpy_avx2
# endif

# endif

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
# define VEC_SIZE 32
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
# define SECTION(p) p##.avx
# endif

/* zero register */
#define xmmZ xmm0
#define ymmZ ymm0

/* mask register */
#define ymmM ymm1

# ifndef USE_AS_STRCAT

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
	mov %RDX_LP, %R8_LP
	test %R8_LP, %R8_LP
	jz L(ExitZero)
# endif
	mov %rsi, %rcx
# ifndef USE_AS_STPCPY
	mov %rdi, %rax /* save result */
# endif

# endif

	vpxor %xmmZ, %xmmZ, %xmmZ

	and $((VEC_SIZE * 4) - 1), %ecx
	cmp $(VEC_SIZE * 2), %ecx
	jbe L(SourceStringAlignmentLessTwoVecSize)

	and $-VEC_SIZE, %rsi
	and $(VEC_SIZE - 1), %ecx

	vpcmpeqb (%rsi), %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	shr %cl, %rdx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov $VEC_SIZE, %r10
	sub %rcx, %r10
	cmp %r10, %r8
# else
	mov $(VEC_SIZE + 1), %r10
	sub %rcx, %r10
	cmp %r10, %r8
# endif
	jbe L(CopyVecSizeTailCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyVecSizeTail)

	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
	vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
	add $VEC_SIZE, %r10
	cmp %r10, %r8
	jbe L(CopyTwoVecSizeCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyTwoVecSize)

	vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
	vmovdqu %ymm2, (%rdi)

/* If source address alignment != destination address alignment */
	.p2align 4
L(UnalignVecSizeBoth):
	sub %rcx, %rdi
# ifdef USE_AS_STRNCPY
	add %rcx, %r8
	sbb %rcx, %rcx
	or %rcx, %r8
# endif
	mov $VEC_SIZE, %rcx
	vmovdqa (%rsi, %rcx), %ymm2
	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 3), %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec3)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec4)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm4, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vmovdqu %ymm2, (%rdi, %rcx)
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec3)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	mov %rsi, %rdx
	lea VEC_SIZE(%rsi, %rcx), %rsi
	and $-(VEC_SIZE * 4), %rsi
	sub %rsi, %rdx
	sub %rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea (VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
L(UnalignedFourVecSizeLoop):
	vmovdqa (%rsi), %ymm4
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm5, %ymm4, %ymm2
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 4), %r8
	jbe L(UnalignedLeaveCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add $(VEC_SIZE * 4), %rdi
	add $(VEC_SIZE * 4), %rsi
	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
	vmovdqa (%rsi), %ymm4
	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vpminub %ymm5, %ymm4, %ymm2
	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqu %ymm7, -VEC_SIZE(%rdi)
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 4), %r8
	jbe L(UnalignedLeaveCase2OrCase3)
# endif
	test %edx, %edx
	jz L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test %edx, %edx
	jnz L(CopyVecSizeUnaligned_0)

	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	test %ecx, %ecx
	jnz L(CopyVecSizeUnaligned_16)

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test %edx, %edx
	jnz L(CopyVecSizeUnaligned_32)

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	bsf %ecx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
	add $(VEC_SIZE - 1), %r8
	sub %rdx, %r8
	lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $(VEC_SIZE * 3), %rsi
	add $(VEC_SIZE * 3), %rdi
	jmp L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */

L(SourceStringAlignmentLessTwoVecSize):
	vmovdqu (%rsi), %ymm3
	vmovdqu VEC_SIZE(%rsi), %ymm2
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp $VEC_SIZE, %r8
# else
	cmp $(VEC_SIZE + 1), %r8
# endif
	jbe L(CopyVecSizeTail1Case2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyVecSizeTail1)

	vmovdqu %ymm3, (%rdi)
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp $(VEC_SIZE * 2), %r8
# else
	cmp $((VEC_SIZE * 2) + 1), %r8
# endif
	jbe L(CopyTwoVecSize1Case2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyTwoVecSize1)

	and $-VEC_SIZE, %rsi
	and $(VEC_SIZE - 1), %ecx
	jmp L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add %rcx, %rdi
# endif
L(CopyVecSizeTail):
	add %rcx, %rsi
L(CopyVecSizeTail1):
	bsf %edx, %edx
L(CopyVecSizeExit):
	cmp $32, %edx
	jae L(Exit32_63)
	cmp $16, %edx
	jae L(Exit16_31)
	cmp $8, %edx
	jae L(Exit8_15)
	cmp $4, %edx
	jae L(Exit4_7)
	cmp $3, %edx
	je L(Exit3)
	cmp $1, %edx
	ja L(Exit2)
	je L(Exit1)
	movb $0, (%rdi)
# ifdef USE_AS_STPCPY
	lea (%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $1, %r8
	lea 1(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(CopyTwoVecSize1):
	add $VEC_SIZE, %rsi
	add $VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $VEC_SIZE, %r8
# endif
	jmp L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf %edx, %edx
	add %rcx, %rsi
	add $VEC_SIZE, %edx
	sub %ecx, %edx
	jmp L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
	vmovdqu %ymm4, (%rdi)
	add $((VEC_SIZE * 4) - 1), %r8
	sub %rdx, %r8
	lea 1(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	jmp L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf %ecx, %edx
	vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea VEC_SIZE(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add $((VEC_SIZE * 3) - 1), %r8
	sub %rdx, %r8
	lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $VEC_SIZE, %rsi
	add $VEC_SIZE, %rdi
	jmp L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf %edx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	add $((VEC_SIZE * 2) - 1), %r8
	sub %rdx, %r8
	lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $(VEC_SIZE * 2), %rsi
	add $(VEC_SIZE * 2), %rdi
	jmp L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
# ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	vmovdqu %ymm6, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	vmovdqu %ymm5, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	vmovdqu %ymm4, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	vmovdqu %ymm3, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)
# endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add $VEC_SIZE, %r8
	add %rcx, %rdi
	add %rcx, %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add %rcx, %rsi
	bsf %edx, %edx
	add $VEC_SIZE, %edx
	sub %ecx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add %rcx, %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

/* Case2 or Case3, Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add $VEC_SIZE, %r8
	add %rcx, %rdi
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyTwoVecSizeCase2)
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeTailCase2)
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add $VEC_SIZE, %rdi
	add $VEC_SIZE, %rsi
	sub $VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeTail1Case2)
	jmp L(StrncpyExit)
# endif

/*------------ End of labels for copying 1..VEC_SIZE and 1..(VEC_SIZE * 2) bytes ------------*/

	.p2align 4
L(Exit1):
	movzwl (%rsi), %edx
	mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $2, %r8
	lea 2(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit2):
	movzwl (%rsi), %ecx
	mov %cx, (%rdi)
	movb $0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea 2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $3, %r8
	lea 3(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit3):
	mov (%rsi), %edx
	mov %edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $4, %r8
	lea 4(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit4_7):
	mov (%rsi), %ecx
	mov %ecx, (%rdi)
	mov -3(%rsi, %rdx), %ecx
	mov %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit8_15):
	mov (%rsi), %rcx
	mov -7(%rsi, %rdx), %r9
	mov %rcx, (%rdi)
	mov %r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit16_31):
	vmovdqu (%rsi), %xmm2
	vmovdqu -15(%rsi, %rdx), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit32_63):
	vmovdqu (%rsi), %ymm2
	vmovdqu -31(%rsi, %rdx), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCPY

	.p2align 4
L(StrncpyExit1):
	movzbl (%rsi), %edx
	mov %dl, (%rdi)
# ifdef USE_AS_STPCPY
	lea 1(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 1(%rdi)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit2):
	movzwl (%rsi), %edx
	mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 2(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 2(%rdi)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit3_4):
	movzwl (%rsi), %ecx
	movzwl -2(%rsi, %r8), %edx
	mov %cx, (%rdi)
	mov %dx, -2(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit5_8):
	mov (%rsi), %ecx
	mov -4(%rsi, %r8), %edx
	mov %ecx, (%rdi)
	mov %edx, -4(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit9_16):
	mov (%rsi), %rcx
	mov -8(%rsi, %r8), %rdx
	mov %rcx, (%rdi)
	mov %rdx, -8(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit17_32):
	vmovdqu (%rsi), %xmm2
	vmovdqu -16(%rsi, %r8), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -16(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit33_64):
	/* 0/32, 31/16 */
	vmovdqu (%rsi), %ymm2
	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	vmovdqu (%rsi), %ymm2
	vmovdqu 32(%rsi), %ymm3
	mov 64(%rsi), %cl
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, 32(%rdi)
	mov %cl, 64(%rdi)
# ifdef USE_AS_STPCPY
	lea 65(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 65(%rdi)
# endif
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCAT

	.p2align 4
L(Fill1):
	mov %dl, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill2):
	mov %dx, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill3_4):
	mov %dx, (%rdi)
	mov %dx, -2(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill5_8):
	mov %edx, (%rdi)
	mov %edx, -4(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill9_16):
	mov %rdx, (%rdi)
	mov %rdx, -8(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill17_32):
	vmovdqu %xmmZ, (%rdi)
	vmovdqu %xmmZ, -16(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	vmovdqu %ymm2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf %edx, %edx
	add $(VEC_SIZE - 1), %r8
	add %rcx, %rdi
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
	sub %rdx, %r8
	lea 1(%rdi, %rdx), %rdi

	.p2align 4
L(StrncpyFillTailWithZero):
	xor %edx, %edx
	sub $VEC_SIZE, %r8
	jbe L(StrncpyFillExit)

	vmovdqu %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi

	mov %rdi, %rsi
	and $(VEC_SIZE - 1), %esi
	sub %rsi, %rdi
	add %rsi, %r8
	sub $(VEC_SIZE * 4), %r8
	jb L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
	add $(VEC_SIZE * 4), %rdi
	sub $(VEC_SIZE * 4), %r8
	jae L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add $(VEC_SIZE * 2), %r8
	jl L(StrncpyFillLessTwoVecSize)
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	add $(VEC_SIZE * 2), %rdi
	sub $VEC_SIZE, %r8
	jl L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi
	jmp L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add $VEC_SIZE, %r8
	jl L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi
	jmp L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add $VEC_SIZE, %r8
L(Fill):
	cmp $17, %r8d
	jae L(Fill17_32)
	cmp $9, %r8d
	jae L(Fill9_16)
	cmp $5, %r8d
	jae L(Fill5_8)
	cmp $3, %r8d
	jae L(Fill3_4)
	cmp $1, %r8d
	ja L(Fill2)
	je L(Fill1)
	VZEROUPPER_RETURN

/* end of ifndef USE_AS_STRCAT */
# endif

	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test %rdx, %rdx
	jnz L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea (VEC_SIZE * 4)(%r8), %rcx
	and $-VEC_SIZE, %rcx
	add $(VEC_SIZE * 3), %r8
	jl L(CopyVecSizeCase3)
	vmovdqu %ymm4, (%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 4)(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (VEC_SIZE * 4)(%rdi)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor %ecx, %ecx
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $(VEC_SIZE * 3), %r8
	jle L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec4)
# else
	jnz L(CopyVecSize)
# endif
	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm4, (%rdi)
	add $VEC_SIZE, %rcx
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec5)
# else
	jnz L(CopyVecSize)
# endif

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add $VEC_SIZE, %rcx
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec6)
# else
	jnz L(CopyVecSize)
# endif

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	lea VEC_SIZE(%rdi, %rcx), %rdi
	lea VEC_SIZE(%rsi, %rcx), %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
L(StrncpyExit):
	cmp $65, %r8d
	je L(StrncpyExit65)
	cmp $33, %r8d
	jae L(StrncpyExit33_64)
	cmp $17, %r8d
	jae L(StrncpyExit17_32)
	cmp $9, %r8d
	jae L(StrncpyExit9_16)
	cmp $5, %r8d
	jae L(StrncpyExit5_8)
	cmp $3, %r8d
	jae L(StrncpyExit3_4)
	cmp $1, %r8d
	ja L(StrncpyExit2)
	je L(StrncpyExit1)
# ifdef USE_AS_STPCPY
	mov %rdi, %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(ExitZero):
# ifndef USE_AS_STRCAT
	mov %rdi, %rax
# endif
	VZEROUPPER_RETURN

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif