/* wcscpy with SSSE3
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* Syntax: GNU as, AT&T operand order (source, destination), 32-bit
   x86, preprocessed by cpp.  Arguments are read from the stack at
   4(%esp) and 8(%esp).  */

#if IS_IN (libc)
# include <sysdep.h>

/* Unwind annotations that accompany a 4-byte push or pop of REG.  */
# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the arguments at function entry (the return
   address occupies offset 0).  LEN is defined but not referenced
   anywhere in this file.  */
# define PARMS	4
/* RETURN restores %edi and returns.  The CFI_PUSH after the ret
   re-declares %edi as pushed for whatever code follows the macro in
   the file, since that code is reached by jumps taken while %edi is
   still on the stack.  */
# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
# define STR1	PARMS
# define STR2	STR1+4
# define LEN	STR2+4

/* __wcscpy_ssse3: copy a string of 4-byte elements, terminated by an
   all-zero element, from the second stack argument to the first.
   (Presumably the C prototype is
   wchar_t *wcscpy (wchar_t *dest, const wchar_t *src) -- confirm
   against the dispatcher that selects this variant.)

   In:    STR1(%esp) = destination, STR2(%esp) = source.
   Out:   %eax = destination pointer as passed in.
   Saved: %edi and %esi are pushed and popped on the long path.
   Clobb: %eax, %ecx, %edx, %xmm0-%xmm7, flags.

   Register roles on the long path:
     %ecx  source cursor          %edx  destination cursor
     %edi  original destination   %esi  byte offset added to both
     %eax  pmovmskb result              cursors on exit
     %xmm0 all-zero comparand.  pcmpeqd writes its result into %xmm0,
	   so %xmm0 stays zero exactly as long as no terminator has
	   been found.

   pmovmskb produces four mask bits per 4-byte element: %al describes
   elements 0-1 of a vector and %ah elements 2-3.  The exit code tests
   bit 0 of %al / %ah to pick the number of bytes still to copy.

   NOTE(review): the movaps loads in the Shl paths require the source
   misalignment relative to 16 to be a multiple of 4; presumably all
   callers pass 4-byte aligned pointers -- confirm.  */

	atom_text_section
ENTRY (__wcscpy_ssse3)
	mov	STR1(%esp), %edx
	mov	STR2(%esp), %ecx

	/* Short strings: a terminator within the first four elements is
	   handled without touching %edi/%esi.  The explicit `l' suffix
	   states the 4-byte operand size, which an immediate-to-memory
	   compare cannot infer from its operands.  */
	cmpl	$0, (%ecx)
	jz	L(ExitTail4)
	cmpl	$0, 4(%ecx)
	jz	L(ExitTail8)
	cmpl	$0, 8(%ecx)
	jz	L(ExitTail12)
	cmpl	$0, 12(%ecx)
	jz	L(ExitTail16)

	PUSH	(%edi)
	mov	%edx, %edi		/* keep the return value */
	PUSH	(%esi)
	lea	16(%ecx), %esi

	and	$-16, %esi		/* first 16-byte boundary above src */

	pxor	%xmm0, %xmm0
	pcmpeqd	(%esi), %xmm0		/* scan that aligned vector */
	movdqu	(%ecx), %xmm1		/* first 16 bytes hold no terminator */
	movdqu	%xmm1, (%edx)

	pmovmskb %xmm0, %eax
	sub	%ecx, %esi		/* esi = offset of scanned vector */

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Advance the destination to its next 16-byte boundary and the
	   source by the same distance.  */
	mov	%edx, %eax
	lea	16(%edx), %edx
	and	$-16, %edx
	sub	%edx, %eax		/* eax = -(bytes advanced) */

	sub	%eax, %ecx
	mov	%ecx, %eax
	and	$0xf, %eax		/* source misalignment, sets ZF */
	/* Must stay a mov: the jz below consumes the flags of the and
	   above, and mov leaves flags untouched.  */
	mov	$0, %esi

	jz	L(Align16Both)
	cmp	$4, %eax
	je	L(Shl4)
	cmp	$8, %eax
	je	L(Shl8)
	jmp	L(Shl12)

/* Source and destination are both 16-byte aligned.  Each step stores
   the previously loaded vector, scans the next one and bumps %esi so
   that (%ecx,%esi) / (%edx,%esi) address the vector just scanned.  */
L(Align16Both):
	movaps	(%ecx), %xmm1
	movaps	16(%ecx), %xmm2
	movaps	%xmm1, (%edx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm4
	movaps	%xmm3, (%edx, %esi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm1
	movaps	%xmm4, (%edx, %esi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm2
	movaps	%xmm1, (%edx, %esi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Round the source cursor down to a 64-byte boundary and move the
	   destination cursor by the same amount.  */
	movaps	%xmm3, (%edx, %esi)
	mov	%ecx, %eax
	lea	16(%ecx, %esi), %ecx
	and	$-0x40, %ecx
	sub	%ecx, %eax
	sub	%eax, %edx

	/* Both cursors are advanced by 64 before the loop branches out, so
	   -64 addresses the first vector of the current group.  */
	mov	$-0x40, %esi

/* Main aligned loop: 64 bytes per iteration.  The four vectors are
   folded with a byte-wise minimum and compared element-wise against
   zero.  That test can also fire when no single element is zero, so
   L(Aligned64Leave) re-tests every vector and may resume the loop.  */
L(Aligned64Loop):
	movaps	(%ecx), %xmm2
	movaps	32(%ecx), %xmm3
	movaps	%xmm2, %xmm4
	movaps	16(%ecx), %xmm5
	movaps	%xmm3, %xmm6
	movaps	48(%ecx), %xmm7
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	lea	64(%edx), %edx
	pcmpeqd	%xmm0, %xmm3
	lea	64(%ecx), %ecx
	pmovmskb %xmm3, %eax

	test	%eax, %eax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%edx)
	movaps	%xmm5, -48(%edx)
	movaps	%xmm6, -32(%edx)
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

/* Exact per-vector scan of the group saved in %xmm4-%xmm7.  Every
   vector found clean is stored; %esi steps from -64 to -16.  The lea
   sits between test and jnz because it does not modify flags.  */
L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm4, -64(%edx)
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm5, -48(%edx)
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%edx)
	pcmpeqd	%xmm7, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	/* The folded test was a false positive: finish the group and
	   continue with the loop.  */
	mov	$-0x40, %esi
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

/* Destination 16-byte aligned, source 4 bytes past a 16-byte
   boundary.  Aligned source vectors are loaded and each output vector
   is assembled with palignr $4 from the previous vector (upper 12
   bytes) and the next one (lower 4 bytes).  %xmm1 and %xmm3 alternate
   as holder of the previous vector.  */
	.p2align 4
L(Shl4):
	movaps	-4(%ecx), %xmm1
	movaps	12(%ecx), %xmm2
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	28(%ecx), %ecx
	lea	16(%edx), %edx

	/* Round the aligned source address down to a 64-byte boundary,
	   pull the destination back by the same amount, then restore the
	   12-byte bias so that 12(%ecx) is the first vector of a group.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-12(%ecx), %ecx
	sub	%eax, %edx

	movaps	-4(%ecx), %xmm1

/* 64 bytes per iteration.  On a (possibly false) hit of the folded
   test, control returns to L(Shl4Start), which uses only %xmm1 and
   %xmm2 -- both still unmodified at the jnz -- and reloads the rest.  */
L(Shl4LoopStart):
	movaps	12(%ecx), %xmm2
	movaps	28(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7		/* becomes the next previous vector */
	palignr	$4, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$4, %xmm3, %xmm4
	jnz	L(Shl4Start)

	palignr	$4, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl4LoopStart)

/* %eax is the mask of the aligned vector at 12(%ecx).  Copy the 12
   source bytes in front of it (%esi is scratch here and restored by
   the POP), step both cursors onto that vector and dispatch on the
   mask; the 8-byte case is handled inline.  */
L(Shl4LoopExit):
	movlpd	(%ecx), %xmm0
	movl	8(%ecx), %esi
	movlpd	%xmm0, (%edx)
	movl	%esi, 8(%edx)
	POP	(%esi)
	add	$12, %edx
	add	$12, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	/* The code below is entered with %esi still pushed.  */
	CFI_PUSH	(%esi)

/* Same scheme as L(Shl4) for a source 8 bytes past a 16-byte
   boundary: palignr $8 joins the upper 8 bytes of the previous vector
   with the lower 8 bytes of the next one.  */
	.p2align 4
L(Shl8):
	movaps	-8(%ecx), %xmm1
	movaps	8(%ecx), %xmm2
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	24(%ecx), %ecx
	lea	16(%edx), %edx

	/* Re-base on a 64-byte source boundary, keeping the 8-byte bias.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-8(%ecx), %ecx
	sub	%eax, %edx

	movaps	-8(%ecx), %xmm1

L(Shl8LoopStart):
	movaps	8(%ecx), %xmm2
	movaps	24(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7		/* becomes the next previous vector */
	palignr	$8, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$8, %xmm3, %xmm4
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl8LoopStart)

/* %eax is the mask of the aligned vector at 8(%ecx).  Copy the 8
   bytes in front of it, step onto the vector and dispatch.  */
L(Shl8LoopExit):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	POP	(%esi)
	add	$8, %edx
	add	$8, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	/* The code below is entered with %esi still pushed.  */
	CFI_PUSH	(%esi)

/* Same scheme for a source 12 bytes past a 16-byte boundary (also the
   fall-back of the dispatch at function entry): palignr $12 joins the
   upper 4 bytes of the previous vector with the lower 12 bytes of the
   next one.  */
	.p2align 4
L(Shl12):
	movaps	-12(%ecx), %xmm1
	movaps	4(%ecx), %xmm2
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	20(%ecx), %ecx
	lea	16(%edx), %edx

	/* Re-base on a 64-byte source boundary, keeping the 4-byte bias.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-4(%ecx), %ecx
	sub	%eax, %edx

	movaps	-12(%ecx), %xmm1

L(Shl12LoopStart):
	movaps	4(%ecx), %xmm2
	movaps	20(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7		/* becomes the next previous vector */
	palignr	$12, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$12, %xmm3, %xmm4
	jnz	L(Shl12Start)

	palignr	$12, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl12LoopStart)

/* %eax is the mask of the aligned vector at 4(%ecx).  Copy the 4
   bytes in front of it, set the cursor adjustment to 4 and fall
   through into the common exit below.  */
L(Shl12LoopExit):
	movl	(%ecx), %esi
	movl	%esi, (%edx)
	mov	$4, %esi

/* Common exit.  In: %esi = offset that moves both cursors onto the
   vector containing the terminator, %eax = its pmovmskb mask.  Copies
   up to and including the terminating element.  */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)
	test	%al, %al
	jz	L(ExitHigh)		/* terminator is element 2 or 3 */
	test	$0x01, %al
	jnz	L(Exit4)		/* terminator is element 0 */
L(Exit8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit12)		/* terminator is element 2 */
L(Exit16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edi, %eax
	RETURN

/* The ExitTail paths are reached from the entry checks, before
   anything has been pushed, so they return with a plain ret and take
   the result from %edx.  */
	CFI_POP	(%edi)

	.p2align 4
L(ExitTail4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	ret

END (__wcscpy_ssse3)
#endif