/* strcpy with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY	__strcpy_evex
#  endif

# endif

# define VMOVU		vmovdqu64
# define VMOVA		vmovdqa64

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
#  define VEC_SIZE	32
# endif

# define XMM2		xmm18
# define XMM3		xmm19

# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22
# define YMM7		ymm23

# ifndef USE_AS_STRCAT

/* zero register */
#  define XMMZERO	xmm16
#  define YMMZERO	ymm16
#  define YMM1		ymm17

	.section .text.evex,"ax",@progbits
ENTRY (STRCPY)
#  ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
#  endif
	mov	%rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov	%rdi, %rax	/* save result */
#  endif

	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# endif

	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

	vpcmpb	$0, (%rsi), %YMMZERO, %k0
	kmovd	%k0, %edx
	shr	%cl, %rdx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
	kmovd	%k1, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	VMOVU	(%rsi, %rcx), %YMM2	/* copy VEC_SIZE bytes */
	VMOVU	%YMM2, (%rdi)

/* If source address alignment != destination address alignment */
	.p2align 4
L(UnalignVecSizeBoth):
	sub	%rcx, %rdi
# ifdef USE_AS_STRNCPY
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	VMOVA	(%rsi, %rcx), %YMM2
	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM3, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
	vpcmpb	$0, %YMM4, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM4, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
	VMOVU	%YMM2, (%rdi, %rcx)
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM3, (%rdi, %rcx)
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
L(UnalignedFourVecSizeLoop):
	VMOVA	(%rsi), %YMM4
	VMOVA	VEC_SIZE(%rsi), %YMM5
	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
	vpminub	%YMM5, %YMM4, %YMM2
	vpminub	%YMM7, %YMM6, %YMM3
	vpminub	%YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM2, %YMMZERO, %k7
	kmovd	%k7, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
	VMOVA	(%rsi), %YMM4
	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
	VMOVA	VEC_SIZE(%rsi), %YMM5
	vpminub	%YMM5, %YMM4, %YMM2
	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
	VMOVU	%YMM7, -VEC_SIZE(%rdi)
	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
	vpminub	%YMM7, %YMM6, %YMM3
	vpminub	%YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM2, %YMMZERO, %k7
	kmovd	%k7, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpb	$0, %YMM4, %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpb	$0, %YMM5, %YMMZERO, %k2
	kmovd	%k2, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpb	$0, %YMM6, %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

	vpcmpb	$0, %YMM7, %YMMZERO, %k4
	kmovd	%k4, %ecx
	bsf	%ecx, %edx
	VMOVU	%YMM4, (%rdi)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */

L(SourceStringAlignmentLessTwoVecSize):
	VMOVU	(%rsi), %YMM3
	VMOVU	VEC_SIZE(%rsi), %YMM2
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
#  else
	cmp	$(VEC_SIZE + 1), %r8
#  endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	VMOVU	%YMM3, (%rdi)
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
#  else
	cmp	$((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
L(CopyVecSizeExit):
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	VMOVU	%YMM4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	VMOVU	%YMM4, (%rdi)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	VMOVU	%YMM6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	VMOVU	%YMM5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	VMOVU	%YMM4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	VMOVU	%YMM3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
#  endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

/* Case2 or Case3,  Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/

	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit16_31):
	VMOVU	(%rsi), %XMM2
	VMOVU	-15(%rsi, %rdx), %XMM3
	VMOVU	%XMM2, (%rdi)
	VMOVU	%XMM3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit32_63):
	VMOVU	(%rsi), %YMM2
	VMOVU	-31(%rsi, %rdx), %YMM3
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

# ifdef USE_AS_STRNCPY

	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
#  endif
	ret

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
#  endif
	ret

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit17_32):
	VMOVU	(%rsi), %XMM2
	VMOVU	-16(%rsi, %r8), %XMM3
	VMOVU	%XMM2, (%rdi)
	VMOVU	%XMM3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit33_64):
	/*  0/32, 31/16 */
	VMOVU	(%rsi), %YMM2
	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	VMOVU	(%rsi), %YMM2
	VMOVU	32(%rsi), %YMM3
	mov	64(%rsi), %cl
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, 32(%rdi)
	mov	%cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
#  endif
	ret

#  ifndef USE_AS_STRCAT

	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	ret

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	ret

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	ret

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	ret

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	ret

	.p2align 4
L(Fill17_32):
	VMOVU	%XMMZERO, (%rdi)
	VMOVU	%XMMZERO, -16(%rdi, %r8)
	ret

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	VMOVU	%YMM2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#   endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi

	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	VMOVU	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	VMOVA	%YMMZERO, (%rdi)
	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	VMOVA	%YMMZERO, (%rdi)
	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	VMOVA	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	VMOVA	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	ret

/* end of ifndef USE_AS_STRCAT */
#  endif

	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	VMOVU	%YMM4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
#  endif
	ret

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpb	$0, %YMM4, %YMMZERO, %k1
	kmovd	%k1, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
#  else
	jnz	L(CopyVecSize)
#  endif
	vpcmpb	$0, %YMM5, %YMMZERO, %k2
	kmovd	%k2, %edx
	VMOVU	%YMM4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpb	$0, %YMM6, %YMMZERO, %k3
	kmovd	%k3, %edx
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpb	$0, %YMM7, %YMMZERO, %k4
	kmovd	%k4, %edx
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
#  endif
	ret

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov	%rdi, %rax
#  endif
	ret

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif