/* Optimized memrchr with sse2 without bsf
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

/* Stack offsets of the three arguments at function entry.  */
# define PARMS  4
# define STR1  PARMS
# define STR2  STR1+4
# define LEN   STR2+4

/* __memrchr_sse2 (STR1, STR2, LEN)

   Syntax: AT&T, 32-bit x86, arguments read from the stack.

   Scans the LEN bytes starting at STR1 from the end towards the start
   for the low byte of STR2.

   Out:   %eax = address of the last matching byte, or 0 if none.
   Uses:  %eax, %ecx, %edx, %xmm0-%xmm4, flags.  %edi is used only on
	  the short unaligned path and is saved/restored around it.

   Register roles on the main path:
     %xmm1  search byte replicated into all 16 lanes
     %ecx   base address of the 16/64-byte window being examined
     %edx   count of bytes not yet examined (biased by -64 while a
	    64-byte window is in flight)
     %eax   16-bit compare mask from pmovmskb; bit N set means byte N
	    of the vector matched.  The highest set bit is the answer.  */

	atom_text_section
ENTRY (__memrchr_sse2)
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1
	mov	LEN(%esp), %edx

	/* Lengths 0..16 are handled separately.  */
	sub	$16, %edx
	jbe	L(length_less16)

	/* Replicate the search byte across %xmm1 (two punpcklbw plus
	   pshufd), interleaved with the pointer setup.  */
	punpcklbw %xmm1, %xmm1
	add	%edx, %ecx		/* %ecx = STR1 + LEN - 16.  */
	punpcklbw %xmm1, %xmm1

	/* Check the last 16 bytes with an unaligned load.  */
	movdqu	(%ecx), %xmm0
	pshufd	$0, %xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(exit_dispatch)

	/* Step back one 64-byte window and, if needed, round the window
	   base to 16 bytes.  %edx is corrected by the same amount, so the
	   bytes that overlap the vector just checked are counted again.  */
	sub	$64, %ecx
	mov	%ecx, %eax
	and	$15, %eax
	jz	L(loop_prolog)

	lea	16(%ecx), %ecx
	lea	16(%edx), %edx
	sub	%eax, %edx
	and	$-16, %ecx

	.p2align 4
/* Loop start on aligned string.  */
L(loop_prolog):
	/* Fewer than 65 bytes left: finish in the bounds-checked tail.  */
	sub	$64, %edx
	jbe	L(exit_loop)

	/* First 64-byte window, highest vector first.  */
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(exit_dispatch)

	/* Second 64-byte window.  */
	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(exit_dispatch)

	/* Round the window base to 64 bytes for the main loop.  The AND
	   already sets ZF, so no separate test is needed.  */
	mov	%ecx, %eax
	and	$63, %eax
	jz	L(align64_loop)

	lea	64(%ecx), %ecx
	lea	64(%edx), %edx
	and	$-64, %ecx
	sub	%eax, %edx

	.p2align 4
L(align64_loop):
	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	(%ecx), %xmm0
	movdqa	16(%ecx), %xmm2
	movdqa	32(%ecx), %xmm3
	movdqa	48(%ecx), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Merge the four compare results so a single mask test covers the
	   whole 64-byte window.  This overwrites %xmm0 and %xmm2.  */
	pmaxub	%xmm3, %xmm0
	pmaxub	%xmm4, %xmm2
	pmaxub	%xmm0, %xmm2
	pmovmskb %xmm2, %eax

	test	%eax, %eax
	jz	L(align64_loop)

	/* A match exists in this window; find the highest vector that
	   contains one.  %xmm3 and %xmm4 still hold their own results.  */
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches48)

	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* The two low vectors were clobbered by the merge, so compare
	   them again.  %xmm1 is consumed here; it is not needed again
	   because every path below returns.  */
	movdqa	16(%ecx), %xmm2

	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	(%ecx), %xmm1

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* Match is in the vector at (%ecx): same decoding as
	   L(exit_dispatch).  */
	pmovmskb %xmm1, %eax
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

/* Tail: at most 64 bytes remain.  After the add below %edx is the number
   of bytes still inside the buffer, and they are the highest-addressed
   bytes of the 64-byte window at %ecx.  A vector is loaded only when at
   least one of its bytes is inside the buffer.  Vectors that may be
   only partly inside it go through the bounds-checked "_1" exits.  */
	.p2align 4
L(exit_loop):
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	/* More than 32 bytes remain: vectors 48 and 32 are fully valid.  */
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches0_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	32(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	xor	%eax, %eax
	ret

/* Unchecked exits.  %ecx is advanced to the vector that produced the
   mask in %eax, then the mask is decoded from the top bit down:
   %ah covers bytes 8..15, the high nibble of %al bytes 4..7, and the
   low nibble bytes 0..3.  %dl/%dh are free as scratch here because the
   remaining count is no longer needed.  */
	.p2align 4
L(matches16):
	lea	16(%ecx), %ecx
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches32):
	lea	32(%ecx), %ecx
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches48):
	lea	48(%ecx), %ecx
	/* Falls through into L(exit_dispatch).  */

	.p2align 4
L(exit_dispatch):
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

	/* Highest match is among bytes 4..7.  */
	.p2align 4
L(exit_dispatch_8):
	test	$0x80, %al
	jnz	L(exit_8)
	test	$0x40, %al
	jnz	L(exit_7)
	test	$0x20, %al
	jnz	L(exit_6)
	lea	4(%ecx), %eax
	ret

	/* Highest match is among bytes 8..15.  */
	.p2align 4
L(exit_dispatch_high):
	mov	%ah, %dh
	and	$15 << 4, %dh
	jnz	L(exit_dispatch_high_8)
	test	$0x08, %ah
	jnz	L(exit_12)
	test	$0x04, %ah
	jnz	L(exit_11)
	test	$0x02, %ah
	jnz	L(exit_10)
	lea	8(%ecx), %eax
	ret

	/* Highest match is among bytes 12..15.  */
	.p2align 4
L(exit_dispatch_high_8):
	test	$0x80, %ah
	jnz	L(exit_16)
	test	$0x40, %ah
	jnz	L(exit_15)
	test	$0x20, %ah
	jnz	L(exit_14)
	lea	12(%ecx), %eax
	ret

/* L(exit_N) returns the address of byte N-1 of the vector at %ecx.  */
	.p2align 4
L(exit_2):
	lea	1(%ecx), %eax
	ret

	.p2align 4
L(exit_3):
	lea	2(%ecx), %eax
	ret

	.p2align 4
L(exit_4):
	lea	3(%ecx), %eax
	ret

	.p2align 4
L(exit_6):
	lea	5(%ecx), %eax
	ret

	.p2align 4
L(exit_7):
	lea	6(%ecx), %eax
	ret

	.p2align 4
L(exit_8):
	lea	7(%ecx), %eax
	ret

	.p2align 4
L(exit_10):
	lea	9(%ecx), %eax
	ret

	.p2align 4
L(exit_11):
	lea	10(%ecx), %eax
	ret

	.p2align 4
L(exit_12):
	lea	11(%ecx), %eax
	ret

	.p2align 4
L(exit_14):
	lea	13(%ecx), %eax
	ret

	.p2align 4
L(exit_15):
	lea	14(%ecx), %eax
	ret

	.p2align 4
L(exit_16):
	lea	15(%ecx), %eax
	ret

/* Bounds-checked exits for the tail.  Each L(matchesK_1) advances %ecx to
   the matching vector and biases %edx by K - 64, so that %edx + N is
   negative exactly when byte N of that vector lies before the start of
   the buffer.  The decoders mirror the unchecked ones but must keep
   %edx live, so they use the unused half of %ax as scratch.  */
	.p2align 4
L(matches0_1):
	lea	-64(%edx), %edx

	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)
	add	$0, %edx		/* Set SF from %edx for byte 0.  */
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches16_1):
	lea	-48(%edx), %edx
	lea	16(%ecx), %ecx

	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)
	add	$0, %edx
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches32_1):
	lea	-32(%edx), %edx
	lea	32(%ecx), %ecx

	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)
	add	$0, %edx
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches48_1):
	lea	-16(%edx), %edx
	lea	48(%ecx), %ecx
	/* Falls through into L(exit_dispatch_1).  */

	.p2align 4
L(exit_dispatch_1):
	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)
	add	$0, %edx
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(exit_dispatch_1_8):
	test	$0x80, %al
	jnz	L(exit_1_8)
	test	$0x40, %al
	jnz	L(exit_1_7)
	test	$0x20, %al
	jnz	L(exit_1_6)
	add	$4, %edx
	jl	L(return_null)
	lea	4(%ecx), %eax
	ret

	.p2align 4
L(exit_dispatch_1_high):
	mov	%ah, %al
	and	$15 << 4, %al
	jnz	L(exit_dispatch_1_high_8)
	test	$0x08, %ah
	jnz	L(exit_1_12)
	test	$0x04, %ah
	jnz	L(exit_1_11)
	test	$0x02, %ah
	jnz	L(exit_1_10)
	add	$8, %edx
	jl	L(return_null)
	lea	8(%ecx), %eax
	ret

	.p2align 4
L(exit_dispatch_1_high_8):
	test	$0x80, %ah
	jnz	L(exit_1_16)
	test	$0x40, %ah
	jnz	L(exit_1_15)
	test	$0x20, %ah
	jnz	L(exit_1_14)
	add	$12, %edx
	jl	L(return_null)
	lea	12(%ecx), %eax
	ret

/* L(exit_1_N): return byte N-1 of the vector at %ecx, or 0 if that byte
   lies before the start of the buffer.  */
	.p2align 4
L(exit_1_2):
	add	$1, %edx
	jl	L(return_null)
	lea	1(%ecx), %eax
	ret

	.p2align 4
L(exit_1_3):
	add	$2, %edx
	jl	L(return_null)
	lea	2(%ecx), %eax
	ret

	.p2align 4
L(exit_1_4):
	add	$3, %edx
	jl	L(return_null)
	lea	3(%ecx), %eax
	ret

	.p2align 4
L(exit_1_6):
	add	$5, %edx
	jl	L(return_null)
	lea	5(%ecx), %eax
	ret

	.p2align 4
L(exit_1_7):
	add	$6, %edx
	jl	L(return_null)
	lea	6(%ecx), %eax
	ret

	.p2align 4
L(exit_1_8):
	add	$7, %edx
	jl	L(return_null)
	lea	7(%ecx), %eax
	ret

	.p2align 4
L(exit_1_10):
	add	$9, %edx
	jl	L(return_null)
	lea	9(%ecx), %eax
	ret

	.p2align 4
L(exit_1_11):
	add	$10, %edx
	jl	L(return_null)
	lea	10(%ecx), %eax
	ret

	.p2align 4
L(exit_1_12):
	add	$11, %edx
	jl	L(return_null)
	lea	11(%ecx), %eax
	ret

	.p2align 4
L(exit_1_14):
	add	$13, %edx
	jl	L(return_null)
	lea	13(%ecx), %eax
	ret

	.p2align 4
L(exit_1_15):
	add	$14, %edx
	jl	L(return_null)
	lea	14(%ecx), %eax
	ret

	.p2align 4
L(exit_1_16):
	add	$15, %edx
	jl	L(return_null)
	lea	15(%ecx), %eax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

/* Short path, buffer start 16-byte aligned.
   In: %eax = buffer start, %edx = length (1..16), %xmm1 = broadcast
   byte.  The compare mask is limited to the low `length' bits and then
   decoded by the unchecked dispatcher.  */
	.p2align 4
L(length_less16_offset0):
	mov	%dl, %cl
	pcmpeqb	(%eax), %xmm1

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx		/* %edx = (1 << length) - 1.  */

	mov	%eax, %ecx
	pmovmskb %xmm1, %eax

	/* AND sets ZF from the masked result, so no test is needed.  */
	and	%edx, %eax
	jnz	L(exit_dispatch)

	xor	%eax, %eax
	ret

/* Short path entry: length is 0..16.  On entry %edx = length - 16 and
   %xmm1 has not been broadcast yet.  */
	.p2align 4
L(length_less16):
	punpcklbw %xmm1, %xmm1
	add	$16, %edx		/* Restore the real length.  */
	je	L(return_null)		/* Flags from the add: length 0.  */
	punpcklbw %xmm1, %xmm1

	mov	%ecx, %eax
	pshufd	$0, %xmm1, %xmm1

	and	$15, %ecx		/* %ecx = misalignment of start.  */
	jz	L(length_less16_offset0)

	PUSH (%edi)

	mov	%cl, %dh
	add	%dl, %dh		/* %dh = misalignment + length.  */
	and	$-16, %eax		/* %eax = aligned block holding start.  */

	/* If the buffer runs past this aligned block, %dh becomes the
	   number of bytes that fall in the next block.  */
	sub	$16, %dh
	ja	L(length_less16_part2)

	/* The whole buffer lies inside one aligned 16-byte block.  */
	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	sar	%cl, %edi		/* Drop bits for bytes before start.  */
	add	%ecx, %eax		/* %eax = buffer start again.  */
	mov	%dl, %cl

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx		/* %edx = (1 << length) - 1.  */

	/* AND sets ZF from the masked result, so no test is needed.  */
	and	%edx, %edi
	jz	L(ret_null)

	bsr	%edi, %edi		/* Index of the last match.  */
	add	%edi, %eax
	POP (%edi)
	ret

	/* The code below is reached by jumps taken while %edi is still
	   pushed, so re-establish that CFI state after the return.  */
	CFI_PUSH (%edi)

	/* The buffer straddles two aligned blocks.  Search the upper
	   block first, because the last match is wanted.  */
	.p2align 4
L(length_less16_part2):
	movdqa	16(%eax), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %edi

	mov	%cl, %ch		/* Keep the misalignment in %ch.  */

	mov	%dh, %cl
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx		/* Mask for the bytes in the upper block.  */

	/* AND sets ZF from the masked result, so no test is needed.  */
	and	%edx, %edi
	jnz	L(length_less16_part2_return)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	mov	%ch, %cl
	sar	%cl, %edi		/* Drop bits for bytes before start.  */
	/* Explicit test kept: a shift leaves the flags untouched when its
	   count is zero, so its flags are not relied upon.  */
	test	%edi, %edi
	jz	L(ret_null)

	bsr	%edi, %edi
	add	%edi, %eax
	xor	%ch, %ch		/* %ecx = misalignment only.  */
	add	%ecx, %eax
	POP (%edi)
	ret

	CFI_PUSH (%edi)

	.p2align 4
L(length_less16_part2_return):
	bsr	%edi, %edi
	lea	16(%eax, %edi), %eax
	POP (%edi)
	ret

	CFI_PUSH (%edi)

	/* No match on a path that has %edi pushed.  */
	.p2align 4
L(ret_null):
	xor	%eax, %eax
	POP (%edi)
	ret

END (__memrchr_sse2)
#endif