/* Optimized memchr with sse2 without bsf
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#if IS_IN (libc)

# include <sysdep.h>

/* Unwind annotations that go with a 4-byte push of REG: the CFA offset
   grows by 4 and REG is recorded as saved at offset 0.  */
# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

/* Unwind annotations that go with a 4-byte pop of REG.  */
# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

/* A push/pop instruction together with its unwind annotation.  */
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

/* ENTRANCE is the prologue, PARMS the %esp offset of the first
   argument and RETURN the epilogue.  The length-limited variant pushes
   %edi in ENTRANCE, so its PARMS is 4 larger than in the
   USE_AS_RAWMEMCHR variant, which pushes nothing and defines no RETURN
   (it uses a plain ret).  RETURN ends with CFI_PUSH so that the
   instructions placed after the ret are again annotated as running
   with %edi saved on the stack.  */
# ifndef USE_AS_RAWMEMCHR
# define ENTRANCE PUSH(%edi);
# define PARMS 8
# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
# else
# define ENTRANCE
# define PARMS 4
# endif

/* %esp offsets of the first and second arguments.  */
# define STR1 PARMS
# define STR2 STR1+4

/* %esp offset of the third argument (length-limited variant only).  */
# ifndef USE_AS_RAWMEMCHR
# define LEN STR2+4
# endif

/* Symbol name of the routine unless the includer chose another one.  */
# ifndef MEMCHR
# define MEMCHR __memchr_sse2
# endif

/* MEMCHR (default name __memchr_sse2).

   Arguments are read from the stack: STR1 is the start pointer, STR2
   the value whose low byte is searched for and, unless
   USE_AS_RAWMEMCHR is defined, LEN is the number of bytes to examine.
   The result is left in %eax: the address of the first matching byte,
   or 0 when the length-limited variant finds no match within LEN
   bytes (including LEN == 0).

   Register use when USE_AS_RAWMEMCHR is not defined:
     %edi   current scan pointer (pushed by ENTRANCE, popped by RETURN)
     %edx   number of bytes still to be examined
     %ecx   misalignment of the pointer, later the offset (0/16/32/48)
	    of the current 16-byte chunk; %cl and %ch are scratch while
	    a match mask is decoded
     %eax   pmovmskb match mask, finally the return value
     %xmm1  the searched byte replicated into all 16 byte lanes
     %xmm0, %xmm2, %xmm3, %xmm4  compare results of the 16-byte chunks
   When USE_AS_RAWMEMCHR is defined no length is tracked: %edx is the
   scan pointer and %edi is never touched.

   The position of a match inside a 16-bit mask is found with chains
   of test instructions rather than a bit-scan instruction (the
   "without bsf" of the file title).  */
	atom_text_section
ENTRY (MEMCHR)
	ENTRANCE
	/* %ecx = start pointer, low dword of %xmm1 = searched value.  */
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1
# ifndef USE_AS_RAWMEMCHR
	/* A zero length can never match.  */
	mov	LEN(%esp), %edx
	test	%edx, %edx
	jz	L(return_null)
# endif

	/* The two punpcklbw below plus the pshufd replicate the low byte
	   of %xmm1 into all 16 lanes; the pointer bookkeeping is
	   interleaved with them.  */
	punpcklbw %xmm1, %xmm1
# ifndef USE_AS_RAWMEMCHR
	mov	%ecx, %edi
# else
	mov	%ecx, %edx
# endif
	punpcklbw %xmm1, %xmm1

	/* %ecx = pointer mod 64.  If it is above 48, a 16-byte load at the
	   pointer would run past the end of the 64-byte block that
	   contains it (presumably a cache line, going by the label name),
	   so the aligned-load path is taken instead.  */
	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1
	cmp	$48, %ecx
	ja	L(crosscache)

	/* Check the first 16 bytes with an unaligned load.  */
# ifndef USE_AS_RAWMEMCHR
	movdqu	(%edi), %xmm0
# else
	movdqu	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	/* The match may lie beyond LEN, so use the length-checked exits.  */
	jnz	L(match_case2_prolog)

	/* No match in the first 16 bytes; done if LEN <= 16.  */
	sub	$16, %edx
	jbe	L(return_null)
	/* Move %edi to the next 16-byte aligned address, which is
	   (pointer & 15) bytes before pointer + 16, and add that many
	   bytes back to the remaining count since they will be scanned
	   again.  */
	lea	16(%edi), %edi
	and	$15, %ecx
	and	$-16, %edi
	add	%ecx, %edx
# else
	jnz	L(match_case1_prolog)
	/* Continue at the next 16-byte aligned address.  */
	lea	16(%edx), %edx
	and	$-16, %edx
# endif
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* %ecx = pointer mod 16.  Load the aligned 16-byte chunk that
	   contains the start pointer.  */
	and	$15, %ecx
# ifndef USE_AS_RAWMEMCHR
	and	$-16, %edi
	movdqa	(%edi), %xmm0
# else
	and	$-16, %edx
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	/* Shift out the mask bits of the bytes that precede the start
	   pointer, so bit 0 corresponds to the start pointer itself.  */
	sar	%cl, %eax
	test	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	/* The target adds %ecx back to %edi before decoding the mask.  */
	jnz	L(match_case2_prolog1)
	/* "ecx" is less than 16. Calculate "edx + ecx - 16" by using
	   "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to avoid
	   possible addition overflow. */
	neg	%ecx
	add	$16, %ecx
	sub	%ecx, %edx
	jbe	L(return_null)
	lea	16(%edi), %edi
# else
	/* The target adds %ecx back to %edx before decoding the mask.  */
	jnz	L(match_case1_prolog1)
	lea	16(%edx), %edx
# endif

	/* From here on the scan pointer is 16-byte aligned.  Two unrolled
	   passes follow, each testing four 16-byte chunks one at a time
	   with %ecx holding the offset of the chunk being tested.  In the
	   length-limited variant a pass is only started when more than 64
	   bytes remain, so a match found here needs no length check.  */
	.p2align 4
L(loop_prolog):
# ifndef USE_AS_RAWMEMCHR
	/* 64 bytes or fewer left: finish in the length-checked tail.  */
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
# else
	movdqa	(%edx), %xmm0
# endif
	/* First pass, chunk at offset 0.  */
	pcmpeqb	%xmm1, %xmm0
	xor	%ecx, %ecx
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	/* First pass, chunk at offset 16.  */
# ifndef USE_AS_RAWMEMCHR
	movdqa	16(%edi), %xmm2
# else
	movdqa	16(%edx), %xmm2
# endif
	pcmpeqb	%xmm1, %xmm2
	lea	16(%ecx), %ecx
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	/* First pass, chunk at offset 32.  */
# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	lea	16(%ecx), %ecx
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	/* First pass, chunk at offset 48.  */
# ifndef USE_AS_RAWMEMCHR
	movdqa	48(%edi), %xmm4
# else
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm4
	lea	16(%ecx), %ecx
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	/* Second pass over the next 64 bytes, same structure.  */
# ifndef USE_AS_RAWMEMCHR
	lea	64(%edi), %edi
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	(%edi), %xmm0
# else
	lea	64(%edx), %edx
	movdqa	(%edx), %xmm0
# endif
	/* Second pass, chunk at offset 0.  */
	pcmpeqb	%xmm1, %xmm0
	xor	%ecx, %ecx
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	/* Second pass, chunk at offset 16.  */
# ifndef USE_AS_RAWMEMCHR
	movdqa	16(%edi), %xmm2
# else
	movdqa	16(%edx), %xmm2
# endif
	pcmpeqb	%xmm1, %xmm2
	lea	16(%ecx), %ecx
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	/* Second pass, chunk at offset 32.  */
# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	lea	16(%ecx), %ecx
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	/* Second pass, chunk at offset 48.  */
# ifndef USE_AS_RAWMEMCHR
	movdqa	48(%edi), %xmm4
# else
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm4
	lea	16(%ecx), %ecx
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	/* Step past the second pass and round the scan pointer down to a
	   64-byte boundary.  In the length-limited variant the bytes
	   between that boundary and the stepped pointer will be scanned
	   again, so their number (%ecx) is added back to the remaining
	   count.  */
# ifndef USE_AS_RAWMEMCHR
	lea	64(%edi), %edi
	mov	%edi, %ecx
	and	$-64, %edi
	and	$63, %ecx
	add	%ecx, %edx
# else
	lea	64(%edx), %edx
	and	$-64, %edx
# endif

	/* Main loop: 64 bytes per iteration from a 64-byte aligned
	   pointer, with a single mask test for all four chunks.  */
	.p2align 4
L(align64_loop):

# ifndef USE_AS_RAWMEMCHR
	/* 64 bytes or fewer left: finish in the length-checked tail.  */
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
	movdqa	16(%edi), %xmm2
	movdqa	32(%edi), %xmm3
	movdqa	48(%edi), %xmm4
# else
	movdqa	(%edx), %xmm0
	movdqa	16(%edx), %xmm2
	movdqa	32(%edx), %xmm3
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Each compare result byte is 0x00 or 0xff, so the unsigned byte
	   maximum merges the four results like an OR into %xmm4.  %xmm0
	   and %xmm2 keep their own results; %xmm3 and %xmm4 are
	   overwritten.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	/* The pointer is advanced before the mask is tested.  */
# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif
	pmovmskb %xmm4, %eax

	test	%eax, %eax
	jz	L(align64_loop)

	/* One of the four chunks matched: step the pointer back to the
	   start of the 64 bytes just examined.  */
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edi
# else
	sub	$64, %edx
# endif

	/* Find the first matching chunk; %ecx tracks its offset.  The
	   results for offsets 0 and 16 are still in %xmm0 and %xmm2.  */
	pmovmskb %xmm0, %eax
	xor	%ecx, %ecx
	test	%eax, %eax
	jnz	L(match_case1)

	pmovmskb %xmm2, %eax
	lea	16(%ecx), %ecx
	test	%eax, %eax
	jnz	L(match_case1)

	/* The result for offset 32 was overwritten by pmaxub; redo it.  */
# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	lea	16(%ecx), %ecx
	test	%eax, %eax
	jnz	L(match_case1)

	/* Only the chunk at offset 48 is left.  The compare overwrites the
	   pattern register %xmm1; every path from here returns.  The code
	   falls through into match_case1 with the mask in %eax.  */
# ifndef USE_AS_RAWMEMCHR
	pcmpeqb	48(%edi), %xmm1
# else
	pcmpeqb	48(%edx), %xmm1
# endif
	pmovmskb %xmm1, %eax
	lea	16(%ecx), %ecx

	/* Decode a match mask without any length check.  On entry %eax
	   holds the mask and %ecx the offset to add to the scan pointer;
	   the _prolog entry (raw variant) skips that addition.  The mask
	   is narrowed step by step: low byte or high byte, then low or
	   high nibble of that byte, then individual bits.  Only the three
	   lowest bits of a nibble are tested; if none is set the fourth
	   bit is taken as the match.  */
	.p2align 4
L(match_case1):
# ifndef USE_AS_RAWMEMCHR
	add	%ecx, %edi
# else
L(match_case1_prolog1):
	add	%ecx, %edx
L(match_case1_prolog):
# endif
	/* Low mask byte zero: the match is in bytes 8..15.  */
	test	%al, %al
	jz	L(match_case1_high)
	/* Low nibble zero: the match is in bytes 4..7.  */
	mov	%al, %cl
	and	$15, %cl
	jz	L(match_case1_8)
	test	$0x01, %al
	jnz	L(ExitCase1_1)
	test	$0x02, %al
	jnz	L(ExitCase1_2)
	test	$0x04, %al
	jnz	L(ExitCase1_3)
	/* Bit 3: match at pointer + 3.  */
# ifndef USE_AS_RAWMEMCHR
	lea	3(%edi), %eax
	RETURN
# else
	lea	3(%edx), %eax
	ret
# endif

	/* Match is in bytes 4..7 of the chunk.  */
	.p2align 4
L(match_case1_8):
	test	$0x10, %al
	jnz	L(ExitCase1_5)
	test	$0x20, %al
	jnz	L(ExitCase1_6)
	test	$0x40, %al
	jnz	L(ExitCase1_7)
	/* Bit 7: match at pointer + 7.  */
# ifndef USE_AS_RAWMEMCHR
	lea	7(%edi), %eax
	RETURN
# else
	lea	7(%edx), %eax
	ret
# endif

	/* Match is in bytes 8..15 of the chunk; its bits are in %ah.  */
	.p2align 4
L(match_case1_high):
	/* Low nibble of %ah zero: the match is in bytes 12..15.  */
	mov	%ah, %ch
	and	$15, %ch
	jz	L(match_case1_high_8)
	test	$0x01, %ah
	jnz	L(ExitCase1_9)
	test	$0x02, %ah
	jnz	L(ExitCase1_10)
	test	$0x04, %ah
	jnz	L(ExitCase1_11)
	/* Bit 3 of %ah: match at pointer + 11.  */
# ifndef USE_AS_RAWMEMCHR
	lea	11(%edi), %eax
	RETURN
# else
	lea	11(%edx), %eax
	ret
# endif

	/* Match is in bytes 12..15 of the chunk.  */
	.p2align 4
L(match_case1_high_8):
	test	$0x10, %ah
	jnz	L(ExitCase1_13)
	test	$0x20, %ah
	jnz	L(ExitCase1_14)
	test	$0x40, %ah
	jnz	L(ExitCase1_15)
	/* Bit 7 of %ah: match at pointer + 15.  */
# ifndef USE_AS_RAWMEMCHR
	lea	15(%edi), %eax
	RETURN
# else
	lea	15(%edx), %eax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
	/* Tail of the length-limited variant, reached when a subtraction
	   of 64 left nothing.  The subtraction is undone so that %edx is
	   again the number of bytes remaining at %edi.  Up to four
	   16-byte chunks are tested; after each chunk without a match the
	   search ends with NULL if the remaining count does not reach
	   past that chunk.  Matches go to the length-checked decoder.  */
	.p2align 4
L(exit_loop):
	add	$64, %edx

	/* Chunk at offset 0.  */
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	xor	%ecx, %ecx
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(match_case2)
	cmp	$16, %edx
	jbe	L(return_null)

	/* Chunk at offset 16.  */
	movdqa	16(%edi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	lea	16(%ecx), %ecx
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(match_case2)
	cmp	$32, %edx
	jbe	L(return_null)

	/* Chunk at offset 32.  */
	movdqa	32(%edi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	lea	16(%ecx), %ecx
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(match_case2)
	cmp	$48, %edx
	jbe	L(return_null)

	/* Chunk at offset 48; the compare overwrites the pattern register,
	   which is not needed afterwards.  */
	pcmpeqb	48(%edi), %xmm1
	lea	16(%ecx), %ecx
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(match_case2)

	/* No match in any of the four chunks: return NULL.  */
	xor	%eax, %eax
	RETURN
# endif

	/* Exits of the unchecked decoder.  ExitCase1_N returns the address
	   of byte N of the chunk counting from 1, that is the scan pointer
	   plus N - 1.  N = 4, 8, 12 and 16 are handled inline in the
	   decoder above.  */
	.p2align 4
L(ExitCase1_1):
# ifndef USE_AS_RAWMEMCHR
	mov	%edi, %eax
	RETURN
# else
	mov	%edx, %eax
	ret
# endif

	.p2align 4
L(ExitCase1_2):
# ifndef USE_AS_RAWMEMCHR
	lea	1(%edi), %eax
	RETURN
# else
	lea	1(%edx), %eax
	ret
# endif

	.p2align 4
L(ExitCase1_3):
# ifndef USE_AS_RAWMEMCHR
	lea	2(%edi), %eax
	RETURN
# else
	lea	2(%edx), %eax
	ret
# endif

	.p2align 4
L(ExitCase1_5):
# ifndef USE_AS_RAWMEMCHR
	lea	4(%edi), %eax
	RETURN
# else
	lea	4(%edx), %eax
	ret
# endif

	.p2align 4
L(ExitCase1_6):
# ifndef USE_AS_RAWMEMCHR
	lea	5(%edi), %eax
	RETURN
# else
	lea	5(%edx), %eax
	ret
# endif

	.p2align 4
L(ExitCase1_7):
# ifndef USE_AS_RAWMEMCHR
	lea	6(%edi), %eax
	RETURN
# else
	lea	6(%edx), %eax
	ret
# endif

	.p2align 4
L(ExitCase1_9):
# ifndef USE_AS_RAWMEMCHR
	lea	8(%edi), %eax
	RETURN
# else
	lea	8(%edx), %eax
	ret
# endif

	.p2align 4
L(ExitCase1_10):
# ifndef USE_AS_RAWMEMCHR
	lea	9(%edi), %eax
	RETURN
# else
	lea	9(%edx), %eax
	ret
# endif

	.p2align 4
L(ExitCase1_11):
# ifndef USE_AS_RAWMEMCHR
	lea	10(%edi), %eax
	RETURN
# else
	lea	10(%edx), %eax
	ret
# endif

	.p2align 4
L(ExitCase1_13):
# ifndef USE_AS_RAWMEMCHR
	lea	12(%edi), %eax
	RETURN
# else
	lea	12(%edx), %eax
	ret
# endif

	.p2align 4
L(ExitCase1_14):
# ifndef USE_AS_RAWMEMCHR
	lea	13(%edi), %eax
	RETURN
# else
	lea	13(%edx), %eax
	ret
# endif

	.p2align 4
L(ExitCase1_15):
# ifndef USE_AS_RAWMEMCHR
	lea	14(%edi), %eax
	RETURN
# else
	lea	14(%edx), %eax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
	/* Length-checked decoder, used when the matching chunk may extend
	   beyond the requested length.  It has three entry points:
	     match_case2          %ecx is the chunk offset; it is removed
				  from the remaining count and added to
				  the pointer
	     match_case2_prolog1  only the pointer is adjusted by %ecx
	     match_case2_prolog   nothing is adjusted
	   The mask in %eax is decoded exactly as in match_case1, but
	   before byte N (counting from 1) is returned the remaining count
	   in %edx must be at least N, otherwise NULL is returned.  */
	.p2align 4
L(match_case2):
	sub	%ecx, %edx
L(match_case2_prolog1):
	add	%ecx, %edi
L(match_case2_prolog):
	/* Low mask byte zero: the match is in bytes 8..15.  */
	test	%al, %al
	jz	L(match_case2_high)
	/* Low nibble zero: the match is in bytes 4..7.  */
	mov	%al, %cl
	and	$15, %cl
	jz	L(match_case2_8)
	test	$0x01, %al
	jnz	L(ExitCase2_1)
	test	$0x02, %al
	jnz	L(ExitCase2_2)
	test	$0x04, %al
	jnz	L(ExitCase2_3)
	/* Bit 3: byte 4, valid only if at least 4 bytes remain.  */
	sub	$4, %edx
	jb	L(return_null)
	lea	3(%edi), %eax
	RETURN

	/* Match is in bytes 4..7 of the chunk.  */
	.p2align 4
L(match_case2_8):
	test	$0x10, %al
	jnz	L(ExitCase2_5)
	test	$0x20, %al
	jnz	L(ExitCase2_6)
	test	$0x40, %al
	jnz	L(ExitCase2_7)
	/* Bit 7: byte 8, valid only if at least 8 bytes remain.  */
	sub	$8, %edx
	jb	L(return_null)
	lea	7(%edi), %eax
	RETURN

	/* Match is in bytes 8..15 of the chunk; its bits are in %ah.  */
	.p2align 4
L(match_case2_high):
	/* Low nibble of %ah zero: the match is in bytes 12..15.  */
	mov	%ah, %ch
	and	$15, %ch
	jz	L(match_case2_high_8)
	test	$0x01, %ah
	jnz	L(ExitCase2_9)
	test	$0x02, %ah
	jnz	L(ExitCase2_10)
	test	$0x04, %ah
	jnz	L(ExitCase2_11)
	/* Bit 3 of %ah: byte 12, valid only if at least 12 bytes remain.  */
	sub	$12, %edx
	jb	L(return_null)
	lea	11(%edi), %eax
	RETURN

	/* Match is in bytes 12..15 of the chunk.  */
	.p2align 4
L(match_case2_high_8):
	test	$0x10, %ah
	jnz	L(ExitCase2_13)
	test	$0x20, %ah
	jnz	L(ExitCase2_14)
	test	$0x40, %ah
	jnz	L(ExitCase2_15)
	/* Bit 7 of %ah: byte 16, valid only if at least 16 bytes remain.  */
	sub	$16, %edx
	jb	L(return_null)
	lea	15(%edi), %eax
	RETURN

	/* Exits of the length-checked decoder.  ExitCase2_N returns the
	   scan pointer plus N - 1 provided at least N bytes remain, and
	   NULL otherwise.  */

	/* NOTE(review): byte 1 is returned without a length check; the
	   paths reaching the decoder appear to guarantee that at least one
	   byte remains (LEN is tested for zero on entry, and exit_loop
	   compares the count before each further chunk) - confirm.  */
	.p2align 4
L(ExitCase2_1):
	mov	%edi, %eax
	RETURN

	.p2align 4
L(ExitCase2_2):
	sub	$2, %edx
	jb	L(return_null)
	lea	1(%edi), %eax
	RETURN

	.p2align 4
L(ExitCase2_3):
	sub	$3, %edx
	jb	L(return_null)
	lea	2(%edi), %eax
	RETURN

	.p2align 4
L(ExitCase2_5):
	sub	$5, %edx
	jb	L(return_null)
	lea	4(%edi), %eax
	RETURN

	.p2align 4
L(ExitCase2_6):
	sub	$6, %edx
	jb	L(return_null)
	lea	5(%edi), %eax
	RETURN

	.p2align 4
L(ExitCase2_7):
	sub	$7, %edx
	jb	L(return_null)
	lea	6(%edi), %eax
	RETURN

	.p2align 4
L(ExitCase2_9):
	sub	$9, %edx
	jb	L(return_null)
	lea	8(%edi), %eax
	RETURN

	.p2align 4
L(ExitCase2_10):
	sub	$10, %edx
	jb	L(return_null)
	lea	9(%edi), %eax
	RETURN

	.p2align 4
L(ExitCase2_11):
	sub	$11, %edx
	jb	L(return_null)
	lea	10(%edi), %eax
	RETURN

	.p2align 4
L(ExitCase2_13):
	sub	$13, %edx
	jb	L(return_null)
	lea	12(%edi), %eax
	RETURN

	.p2align 4
L(ExitCase2_14):
	sub	$14, %edx
	jb	L(return_null)
	lea	13(%edi), %eax
	RETURN

	.p2align 4
L(ExitCase2_15):
	sub	$15, %edx
	jb	L(return_null)
	lea	14(%edi), %eax
	RETURN
# endif

	/* No match: return 0 in %eax.  Only the length-limited code jumps
	   here; the label is assembled in both variants.  */
	.p2align 4
L(return_null):
	xor	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	RETURN
# else
	ret
# endif

END (MEMCHR)
#endif