/* memcmp with SSE4.2, wmemcmp with SSE4.2
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_sse4_2
# endif

/* Unwind bookkeeping for a 4-byte push/pop of REG.  */
# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the three arguments on entry.  */
# define PARMS		4
# define BLK1		PARMS
# define BLK2		BLK1 + 4
# define LEN		BLK2 + 4

/* Return from a point where EBX is saved on the stack.  The trailing
   CFI_PUSH re-establishes the "EBX is pushed" unwind state for the
   code that follows the RET in the file.  */
# define RETURN		POP (%ebx); ret; CFI_PUSH (%ebx)

/* BRANCH_TO_JMPTBL_ENTRY (TABLE, INDEX, SCALE) jumps through entry
   INDEX of the jump table TABLE; INDEX is a register and SCALE is its
   scale factor.  For PIC the table holds offsets relative to the table
   itself: the PC is loaded into EBX, turned into the address of the
   table, and the selected entry is added to it to form the absolute
   target.  EBX is therefore clobbered in the PIC case.  Otherwise the
   table holds absolute addresses and is used directly.  */
# ifdef PIC
#  define JMPTBL(I, B)	I - B

#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
	SETUP_PIC_REG(bx); \
	addl	$(TABLE - .), %ebx; \
	addl	(%ebx,INDEX,SCALE), %ebx; \
	_CET_NOTRACK jmp *%ebx
# else
#  define JMPTBL(I, B)	I

#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
	_CET_NOTRACK jmp *TABLE(,INDEX,SCALE)
# endif

/* Compare the 16 bytes at OFF(%eax) with the 16 bytes at OFF(%edx).
   XMM0 must be zero: PTEST then leaves CF clear exactly when the XOR
   of the two chunks is non-zero, i.e. when they differ.  On a
   difference, branch to L(less16bytes) with EBX = OFF so that it can
   locate the chunk.  Clobbers XMM1, XMM2, EBX and the flags.  */
# define CMP16_AT(OFF) \
	movdqu	OFF(%eax), %xmm1; \
	movdqu	OFF(%edx), %xmm2; \
	mov	$(OFF), %ebx; \
	pxor	%xmm1, %xmm2; \
	ptest	%xmm2, %xmm0; \
	jnc	L(less16bytes)

/* Compare the dword at OFF(%eax) with the dword at OFF(%edx) and leave
   the result in the flags.  The memcmp flavour also leaves the two
   dwords in ECX and EBX, which L(find_diff) needs in order to find the
   first differing byte; the wmemcmp flavour only loads ECX because
   L(find_diff) uses the flags of this comparison directly.  */
# ifndef USE_AS_WMEMCMP
#  define CMP4_AT(OFF) \
	mov	OFF(%eax), %ecx; \
	mov	OFF(%edx), %ebx; \
	cmp	%ebx, %ecx
# else
#  define CMP4_AT(OFF) \
	mov	OFF(%eax), %ecx; \
	cmp	OFF(%edx), %ecx
# endif

# ifndef USE_AS_WMEMCMP
/* Compare the byte at OFF(%eax) with the byte at OFF(%edx) via BL.  */
#  define CMPB_AT(OFF) \
	mov	OFF(%eax), %bl; \
	cmpb	OFF(%edx), %bl

/* Compare the last byte (loaded through ECX) and return.  */
#  define TAIL1_VIA_ECX \
	movzbl	-1(%eax), %ecx; \
	cmp	-1(%edx), %cl; \
	mov	$0, %eax; \
	jne	L(end); \
	RETURN

/* Compare the last byte (loaded through EAX) and return.  */
#  define TAIL1_VIA_EAX \
	movzbl	-1(%eax), %eax; \
	cmpb	-1(%edx), %al; \
	mov	$0, %eax; \
	jne	L(end); \
	RETURN

/* Compare the last two bytes, lower address first, and return.  */
#  define TAIL2 \
	movzwl	-2(%eax), %ecx; \
	movzwl	-2(%edx), %ebx; \
	cmp	%bl, %cl; \
	jne	L(end); \
	cmp	%bh, %ch; \
	mov	$0, %eax; \
	jne	L(end); \
	RETURN

/* Compare the two bytes at offsets -3 and -2, lower address first.
   Falls through when they are equal.  */
#  define CMP2_AT_MINUS3 \
	movzwl	-3(%eax), %ecx; \
	movzwl	-3(%edx), %ebx; \
	cmpb	%bl, %cl; \
	jne	L(end); \
	cmp	%bx, %cx; \
	jne	L(end)
# endif

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */

/* MEMCMP: the arguments are read from the stack at BLK1, BLK2 and LEN.
   For wmemcmp, LEN counts 4-byte elements and is scaled to bytes on
   entry.  The result is returned in EAX: 0 when the blocks are equal,
   otherwise 1 or -1 according to the first difference, except that
   memcmp with LEN == 1 returns the difference of the two bytes.

   Register use once past the entry checks:
     EAX, EDX  the two block pointers;
     ECX       byte count, later scratch;
     EBX       scratch, saved and restored with PUSH/POP;
     XMM0      zero, the PTEST reference for CMP16_AT.

   Lengths up to 64 bytes are dispatched through L(table_64bytes) with
   EAX/EDX advanced to the END of the data, so every L(Nbytes) entry
   uses negative offsets.  Longer inputs are first consumed 64 bytes at
   a time.  */

	.section .text.sse4.2,"ax",@progbits
ENTRY (MEMCMP)
	movl	BLK1(%esp), %eax
	movl	BLK2(%esp), %edx
	movl	LEN(%esp), %ecx

# ifdef USE_AS_WMEMCMP
	/* Convert the element count to a byte count.  */
	shl	$2, %ecx
	test	%ecx, %ecx
	jz	L(return0)
# else
	cmp	$1, %ecx
	jbe	L(less1bytes)
# endif

	pxor	%xmm0, %xmm0
	cmp	$64, %ecx
	ja	L(64bytesormore)
	cmp	$8, %ecx

	/* The memcmp L(less8bytes) path uses BL and pops EBX itself, so
	   EBX is saved before the branch.  The wmemcmp one never touches
	   EBX, so it is saved only on the fall-through path.  PUSH leaves
	   the flags of the CMP above intact.  */
# ifndef USE_AS_WMEMCMP
	PUSH (%ebx)
	jb	L(less8bytes)
# else
	jb	L(less8bytes)
	PUSH (%ebx)
# endif

	/* Point EAX/EDX past the data and dispatch on the length.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)

# ifndef USE_AS_WMEMCMP
	/* memcmp, 2 <= ECX <= 7: compare byte by byte.  */
	.p2align 4
L(less8bytes):
	mov	(%eax), %bl
	cmpb	(%edx), %bl
	jne	L(nonzero)

	CMPB_AT (1)
	jne	L(nonzero)

	cmp	$2, %ecx
	jz	L(0bytes)

	CMPB_AT (2)
	jne	L(nonzero)

	cmp	$3, %ecx
	jz	L(0bytes)

	CMPB_AT (3)
	jne	L(nonzero)

	cmp	$4, %ecx
	jz	L(0bytes)

	CMPB_AT (4)
	jne	L(nonzero)

	cmp	$5, %ecx
	jz	L(0bytes)

	CMPB_AT (5)
	jne	L(nonzero)

	cmp	$6, %ecx
	jz	L(0bytes)

	CMPB_AT (6)
	je	L(0bytes)

	/* The flags still hold the result of the failing byte compare:
	   neither POP nor MOV modifies them.  */
L(nonzero):
	POP (%ebx)
	mov	$1, %eax
	ja	L(above)
	neg	%eax
L(above):
	ret
	CFI_PUSH (%ebx)
# endif

	/* Equal blocks, EBX is on the stack.  */
	.p2align 4
L(0bytes):
	POP (%ebx)
	xor	%eax, %eax
	ret

# ifdef USE_AS_WMEMCMP

	/* wmemcmp with exactly one element (signed comparison).  */
	.p2align 4
L(less8bytes):
	mov	(%eax), %ecx
	cmp	(%edx), %ecx
	je	L(return0)
	mov	$1, %eax
	jg	L(find_diff_bigger)
	neg	%eax
	ret

	.p2align 4
L(find_diff_bigger):
	ret

	.p2align 4
L(return0):
	xor	%eax, %eax
	ret
# endif

# ifndef USE_AS_WMEMCMP
	/* memcmp with LEN <= 1; the flags come from "cmp $1, %ecx".  */
	.p2align 4
L(less1bytes):
	jb	L(0bytesend)
	movzbl	(%eax), %eax
	movzbl	(%edx), %edx
	sub	%edx, %eax
	ret

	.p2align 4
L(0bytesend):
	xor	%eax, %eax
	ret
# endif

	/* More than 64 bytes.  Inside the loop ECX is the constant 64 and
	   EBX is the number of bytes that remain after the current
	   64-byte group, so the loop continues while the subtraction of
	   64 from EBX does not borrow.  */
	.p2align 4
L(64bytesormore):
	PUSH (%ebx)
	mov	%ecx, %ebx
	mov	$64, %ecx
	sub	$64, %ebx
L(64bytesormore_loop):
	movdqu	(%eax), %xmm1
	movdqu	(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_16diff)

	movdqu	16(%eax), %xmm1
	movdqu	16(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_32diff)

	movdqu	32(%eax), %xmm1
	movdqu	32(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_48diff)

	movdqu	48(%eax), %xmm1
	movdqu	48(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_64diff)
	add	%ecx, %eax
	add	%ecx, %edx
	sub	%ecx, %ebx
	jae	L(64bytesormore_loop)

	/* ECX = 64 + EBX = number of bytes still to compare (below 64).
	   Point EAX/EDX past them and dispatch on that count.  */
	add	%ebx, %ecx
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)

# ifdef USE_AS_WMEMCMP

/* This label is needed only to fill the table_64bytes slots for byte
   counts that are not a multiple of 4.  */
L(unreal_case):
/* no code here */

# endif
	/* A 16-byte chunk of the current 64-byte group differs.  Starting
	   from ECX = 64, compute the offset just past that chunk, advance
	   EAX/EDX by it and fall through to L(16bytes), which re-compares
	   the 16 bytes before the pointers one dword at a time.  */
	.p2align 4
L(find_16diff):
	sub	$16, %ecx
L(find_32diff):
	sub	$16, %ecx
L(find_48diff):
	sub	$16, %ecx
L(find_64diff):
	add	%ecx, %edx
	add	%ecx, %eax

	.p2align 4
L(16bytes):
	CMP4_AT (-16)
	jne	L(find_diff)
L(12bytes):
	CMP4_AT (-12)
	jne	L(find_diff)
L(8bytes):
	CMP4_AT (-8)
	jne	L(find_diff)
L(4bytes):
	CMP4_AT (-4)
	/* MOV keeps the flags of the comparison for the JNE below.  */
	mov	$0, %eax
	jne	L(find_diff)
	RETURN

# ifndef USE_AS_WMEMCMP
	.p2align 4
L(49bytes):
	CMP16_AT (-49)
L(33bytes):
	CMP16_AT (-33)
L(17bytes):
	CMP4_AT (-17)
	jne	L(find_diff)
L(13bytes):
	CMP4_AT (-13)
	jne	L(find_diff)
L(9bytes):
	CMP4_AT (-9)
	jne	L(find_diff)
L(5bytes):
	CMP4_AT (-5)
	jne	L(find_diff)
	TAIL1_VIA_ECX

	.p2align 4
L(50bytes):
	CMP16_AT (-50)
L(34bytes):
	CMP16_AT (-34)
L(18bytes):
	CMP4_AT (-18)
	jne	L(find_diff)
L(14bytes):
	CMP4_AT (-14)
	jne	L(find_diff)
L(10bytes):
	CMP4_AT (-10)
	jne	L(find_diff)
L(6bytes):
	CMP4_AT (-6)
	jne	L(find_diff)
L(2bytes):
	TAIL2

	.p2align 4
L(51bytes):
	CMP16_AT (-51)
L(35bytes):
	CMP16_AT (-35)
L(19bytes):
	CMP4_AT (-19)
	jne	L(find_diff)
L(15bytes):
	CMP4_AT (-15)
	jne	L(find_diff)
L(11bytes):
	CMP4_AT (-11)
	jne	L(find_diff)
L(7bytes):
	CMP4_AT (-7)
	jne	L(find_diff)
L(3bytes):
	CMP2_AT_MINUS3
L(1bytes):
	TAIL1_VIA_EAX
# endif
	.p2align 4
L(52bytes):
	CMP16_AT (-52)
L(36bytes):
	CMP16_AT (-36)
L(20bytes):
	CMP16_AT (-20)
	CMP4_AT (-4)
	mov	$0, %eax
	jne	L(find_diff)
	RETURN

# ifndef USE_AS_WMEMCMP
	.p2align 4
L(53bytes):
	CMP16_AT (-53)
L(37bytes):
	CMP16_AT (-37)
L(21bytes):
	CMP16_AT (-21)
	CMP4_AT (-5)
	jne	L(find_diff)
	TAIL1_VIA_ECX

	.p2align 4
L(54bytes):
	CMP16_AT (-54)
L(38bytes):
	CMP16_AT (-38)
L(22bytes):
	CMP16_AT (-22)

	CMP4_AT (-6)
	jne	L(find_diff)
	TAIL2

	.p2align 4
L(55bytes):
	CMP16_AT (-55)
L(39bytes):
	CMP16_AT (-39)
L(23bytes):
	CMP16_AT (-23)
	CMP4_AT (-7)
	jne	L(find_diff)
	CMP2_AT_MINUS3
	TAIL1_VIA_EAX
# endif
	.p2align 4
L(56bytes):
	CMP16_AT (-56)
L(40bytes):
	CMP16_AT (-40)
L(24bytes):
	CMP16_AT (-24)

	CMP4_AT (-8)
	jne	L(find_diff)

	CMP4_AT (-4)
	mov	$0, %eax
	jne	L(find_diff)
	RETURN

# ifndef USE_AS_WMEMCMP
	.p2align 4
L(57bytes):
	CMP16_AT (-57)
L(41bytes):
	CMP16_AT (-41)
L(25bytes):
	CMP16_AT (-25)
	CMP4_AT (-9)
	jne	L(find_diff)
	CMP4_AT (-5)
	jne	L(find_diff)
	TAIL1_VIA_ECX

	.p2align 4
L(58bytes):
	CMP16_AT (-58)
L(42bytes):
	CMP16_AT (-42)
L(26bytes):
	CMP16_AT (-26)

	CMP4_AT (-10)
	jne	L(find_diff)

	CMP4_AT (-6)
	jne	L(find_diff)

	TAIL2

	.p2align 4
L(59bytes):
	CMP16_AT (-59)
L(43bytes):
	CMP16_AT (-43)
L(27bytes):
	CMP16_AT (-27)
	CMP4_AT (-11)
	jne	L(find_diff)
	CMP4_AT (-7)
	jne	L(find_diff)
	CMP2_AT_MINUS3
	TAIL1_VIA_EAX
# endif
	.p2align 4
L(60bytes):
	CMP16_AT (-60)
L(44bytes):
	CMP16_AT (-44)
L(28bytes):
	CMP16_AT (-28)

	CMP4_AT (-12)
	jne	L(find_diff)

	CMP4_AT (-8)
	jne	L(find_diff)

	CMP4_AT (-4)
	mov	$0, %eax
	jne	L(find_diff)
	RETURN

# ifndef USE_AS_WMEMCMP
	.p2align 4
L(61bytes):
	CMP16_AT (-61)
L(45bytes):
	CMP16_AT (-45)
L(29bytes):
	CMP16_AT (-29)

	CMP4_AT (-13)
	jne	L(find_diff)

	CMP4_AT (-9)
	jne	L(find_diff)

	CMP4_AT (-5)
	jne	L(find_diff)
	TAIL1_VIA_ECX

	.p2align 4
L(62bytes):
	CMP16_AT (-62)
L(46bytes):
	CMP16_AT (-46)
L(30bytes):
	CMP16_AT (-30)
	CMP4_AT (-14)
	jne	L(find_diff)
	CMP4_AT (-10)
	jne	L(find_diff)
	CMP4_AT (-6)
	jne	L(find_diff)
	TAIL2

	.p2align 4
L(63bytes):
	CMP16_AT (-63)
L(47bytes):
	CMP16_AT (-47)
L(31bytes):
	CMP16_AT (-31)

	CMP4_AT (-15)
	jne	L(find_diff)
	CMP4_AT (-11)
	jne	L(find_diff)
	CMP4_AT (-7)
	jne	L(find_diff)
	CMP2_AT_MINUS3
	TAIL1_VIA_EAX
# endif

	.p2align 4
L(64bytes):
	CMP16_AT (-64)
L(48bytes):
	CMP16_AT (-48)
L(32bytes):
	CMP16_AT (-32)

	CMP4_AT (-16)
	jne	L(find_diff)

	CMP4_AT (-12)
	jne	L(find_diff)

	CMP4_AT (-8)
	jne	L(find_diff)

	CMP4_AT (-4)
	mov	$0, %eax
	jne	L(find_diff)
	RETURN

	/* A CMP16_AT chunk differed and EBX holds its offset.  Move
	   EAX/EDX to the start of that chunk and compare it one dword at
	   a time.  */
	.p2align 4
L(less16bytes):
	add	%ebx, %eax
	add	%ebx, %edx

	mov	(%eax), %ecx
# ifndef USE_AS_WMEMCMP
	mov	(%edx), %ebx
	cmp	%ebx, %ecx
# else
	cmp	(%edx), %ecx
# endif
	jne	L(find_diff)

	CMP4_AT (4)
	jne	L(find_diff)

	CMP4_AT (8)
	jne	L(find_diff)

	CMP4_AT (12)
	mov	$0, %eax
	jne	L(find_diff)
	RETURN

	/* Two dwords differ.  For memcmp they are in ECX (first block) and
	   EBX (second block): find the first differing byte in memory
	   order, comparing the low byte, the low word, and then the same
	   again for the upper halves.  At L(end) the flags describe that
	   byte pair, compared as unsigned.  For wmemcmp the flags of the
	   dword comparison are still live (POP and MOV do not change them)
	   and are interpreted as signed.  */
	.p2align 4
L(find_diff):
# ifndef USE_AS_WMEMCMP
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	shr	$16,%ecx
	shr	$16,%ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
L(end):
	POP (%ebx)
	mov	$1, %eax
	ja	L(bigger)
	neg	%eax
L(bigger):
	ret
# else
	POP (%ebx)
	mov	$1, %eax
	jg	L(bigger)
	neg	%eax
	ret

	.p2align 4
L(bigger):
	ret
# endif
END (MEMCMP)

/* Jump table indexed by the number of bytes left to compare (0..64).
   JT_ENTRY (N) emits the slot that targets L(Nbytes); JT_ROW4 emits
   four consecutive slots.  wmemcmp only ever sees multiples of 4, so
   JT_WROW (N) emits the slot for N followed by three placeholder slots
   that point at L(unreal_case).  */
# define JT_ENTRY(N)	.int JMPTBL (L(N##bytes), L(table_64bytes))
# define JT_UNREAL	.int JMPTBL (L(unreal_case), L(table_64bytes))
# define JT_ROW4(A, B, C, D) \
	JT_ENTRY (A); JT_ENTRY (B); JT_ENTRY (C); JT_ENTRY (D)
# define JT_WROW(N) \
	JT_ENTRY (N); JT_UNREAL; JT_UNREAL; JT_UNREAL

	.section .rodata.sse4.2,"a",@progbits
	.p2align 2
	.type	L(table_64bytes), @object
L(table_64bytes):
# ifndef USE_AS_WMEMCMP
	JT_ROW4 (0, 1, 2, 3)
	JT_ROW4 (4, 5, 6, 7)
	JT_ROW4 (8, 9, 10, 11)
	JT_ROW4 (12, 13, 14, 15)
	JT_ROW4 (16, 17, 18, 19)
	JT_ROW4 (20, 21, 22, 23)
	JT_ROW4 (24, 25, 26, 27)
	JT_ROW4 (28, 29, 30, 31)
	JT_ROW4 (32, 33, 34, 35)
	JT_ROW4 (36, 37, 38, 39)
	JT_ROW4 (40, 41, 42, 43)
	JT_ROW4 (44, 45, 46, 47)
	JT_ROW4 (48, 49, 50, 51)
	JT_ROW4 (52, 53, 54, 55)
	JT_ROW4 (56, 57, 58, 59)
	JT_ROW4 (60, 61, 62, 63)
	JT_ENTRY (64)
# else
	JT_WROW (0)
	JT_WROW (4)
	JT_WROW (8)
	JT_WROW (12)
	JT_WROW (16)
	JT_WROW (20)
	JT_WROW (24)
	JT_WROW (28)
	JT_WROW (32)
	JT_WROW (36)
	JT_WROW (40)
	JT_WROW (44)
	JT_WROW (48)
	JT_WROW (52)
	JT_WROW (56)
	JT_WROW (60)
	JT_ENTRY (64)
# endif
#endif