/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# ifndef STRCMP_ISA
#  define STRCMP_ISA	_avx2
# endif

# include "strcmp-naming.h"

# include <sysdep.h>

# if defined USE_AS_STRCASECMP_L
#  include "locale-defines.h"
# endif

# ifndef STRCMP
#  define STRCMP	__strcmp_avx2
# endif

# define PAGE_SIZE	4096

	/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32

# define VMOVU	vmovdqu
# define VMOVA	vmovdqa

# ifdef USE_AS_WCSCMP
	/* Compare packed dwords.  */
#  define VPCMPEQ	vpcmpeqd
	/* Compare packed dwords and store minimum.  */
#  define VPMINU	vpminud
	/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
	/* Compare packed bytes.  */
#  define VPCMPEQ	vpcmpeqb
	/* Compare packed bytes and store minimum.  */
#  define VPMINU	vpminub
	/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

# ifdef USE_AS_STRNCMP
#  define LOOP_REG	r9d
#  define LOOP_REG64	r9

#  define OFFSET_REG8	r9b
#  define OFFSET_REG	r9d
#  define OFFSET_REG64	r9
# else
#  define LOOP_REG	edx
#  define LOOP_REG64	rdx

#  define OFFSET_REG8	dl
#  define OFFSET_REG	edx
#  define OFFSET_REG64	rdx
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# if defined USE_AS_STRNCMP
#  define VEC_OFFSET	0
# else
#  define VEC_OFFSET	(-VEC_SIZE)
# endif

# ifdef USE_AS_STRCASECMP_L
#  define BYTE_LOOP_REG	OFFSET_REG
# else
#  define BYTE_LOOP_REG	ecx
# endif

# ifdef USE_AS_STRCASECMP_L
#  ifdef USE_AS_STRNCMP
#   define LOCALE_REG	rcx
#   define LOCALE_REG_LP	RCX_LP
#  else
#   define LOCALE_REG	rdx
#   define LOCALE_REG_LP	RDX_LP
#  endif
# endif

# define xmmZERO	xmm15
# define ymmZERO	ymm15

# define LCASE_MIN_ymm	%ymm10
# define LCASE_MAX_ymm	%ymm11
# define CASE_ADD_ymm	%ymm12

# define LCASE_MIN_xmm	%xmm10
# define LCASE_MAX_xmm	%xmm11
# define CASE_ADD_xmm	%xmm12

	/* r11 is never used elsewhere so it is safe to maintain.  */
# define TOLOWER_BASE	%r11

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# ifdef USE_AS_STRCASECMP_L
#  define REG(x, y) x ## y
#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
	vpaddb	REG(%ext, 8), reg1_in, reg1_out; \
	vpaddb	REG(%ext, 9), reg2_in, reg2_out

#  define TOLOWER_gpr(src, dst)	movl	(TOLOWER_BASE, src, 4), dst
#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)

#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
	VPCMPEQ	scratch_reg, s2_reg, reg_out

#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
	VMOVU	s2_mem, reg_out; \
	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)

#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)

#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)

# else
#  define TOLOWER_gpr(...)
#  define TOLOWER_ymm(...)
#  define TOLOWER_xmm(...)

#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
	VPCMPEQ	s2_reg, s1_reg, reg_out

#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)

#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
# endif
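
/* Illustrative note (added, not from the original source): TOLOWER_ymm
   lowercases every ASCII byte without branches using the L(lcase_min)
   (0x3f), L(lcase_max) (0x99) and L(case_add) (0x20) constants defined
   further down.  A scalar C sketch of the same range test (hypothetical
   helper, relies on 8-bit wraparound exactly like the vector code):

	// Adding 0x3f maps 'A'..'Z' (0x41..0x5a) to 0x80..0x99, which are
	// the only sums that are NOT greater than (signed char) 0x99.
	static inline unsigned char
	tolower_ascii (unsigned char c)
	{
	  unsigned char add =
	    ((signed char) (c + 0x3f) > (signed char) 0x99) ? 0 : 0x20;
	  return (unsigned char) (c + add);	// +0x20 exactly for 'A'..'Z'
	}

   The vector version performs the same test on 32 bytes at a time with
   vpaddb/vpcmpgtb/vpandn.  */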
/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors, operating on either
   packed bytes or dwords depending on USE_AS_WCSCMP.  In order to check
   the null char, the algorithm keeps the matched bytes/dwords, requiring
   two more AVX2 instructions (VPMINU and VPCMPEQ).  In general, the cost
   of comparing VEC_SIZE bytes (32 bytes) is two VPCMPEQ and one VPMINU
   instruction, together with movdqu and testl instructions.  The main
   loop (away from the page boundary) compares 4 vectors at a time,
   effectively comparing 4 x VEC_SIZE bytes (128 bytes) on each loop.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
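
/* Illustrative sketch (added; not part of the build) of the single-vector
   check described above, for the plain byte-wise strcmp case, written
   with AVX2 intrinsics.  cmp_vec is a hypothetical helper name and
   needs <immintrin.h>:

	// Returns a 32-bit mask with a 1 for every byte position where
	// s1 and s2 match and the byte is not the null terminator.
	static inline unsigned int
	cmp_vec (const char *s1, const char *s2)
	{
	  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
	  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
	  __m256i eq = _mm256_cmpeq_epi8 (v1, v2);
	  __m256i nul = _mm256_cmpeq_epi8 (v1, _mm256_setzero_si256 ());
	  __m256i ok = _mm256_andnot_si256 (nul, eq);	// equal AND not null
	  // 0xffffffff -> all 32 bytes equal and non-null, keep going;
	  // otherwise the lowest 0 bit marks the first mismatch or null.
	  return (unsigned int) _mm256_movemask_epi8 (ok);
	}
   */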
	.section SECTION(.text), "ax", @progbits
	.align	16
	.type	STRCMP, @function
	.globl	STRCMP

# ifdef USE_AS_STRCASECMP_L
ENTRY (STRCASECMP)
	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
	mov	%fs:(%rax), %LOCALE_REG_LP

	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
	.p2align 4
END (STRCASECMP)
	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
# endif

	.p2align 4
STRCMP:
	cfi_startproc
	_CET_ENDBR
	CALL_MCOUNT

# if defined USE_AS_STRCASECMP_L
	/* We have to fall back on the C implementation for locales with
	   encodings not matching ASCII for single bytes.  */
#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
#  else
	mov	(%LOCALE_REG), %RAX_LP
#  endif
	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
	jne	STRCASECMP_L_NONASCII
	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
# endif

# ifdef USE_AS_STRNCMP
	/* Don't overwrite LOCALE_REG (rcx) until we have passed
	   L(one_or_less).  Otherwise we might use the wrong locale in
	   the OVERFLOW_STRCMP (strcasecmp_l).  */
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
	cmp	$1, %RDX_LP
	/* Signed comparison intentional.  We use this branch to also
	   test cases where length >= 2^63.  These very large sizes can be
	   handled with strcmp as there is no way for that length to
	   actually bound the buffer.  */
	jle	L(one_or_less)
#  ifdef USE_AS_WCSCMP
	movq	%rdx, %rcx

	/* Multiplying length by sizeof(wchar_t) can result in overflow.
	   Check if that is possible.  All cases where overflow is
	   possible are cases where length is large enough that it can
	   never be a bound on valid memory, so just use wcscmp.  */
	shrq	$56, %rcx
	jnz	OVERFLOW_STRCMP

	leaq	(, %rdx, 4), %rdx
#  endif
# endif
	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
# if defined USE_AS_STRCASECMP_L
	.section .rodata.cst32, "aM", @progbits, 32
	.align	32
L(lcase_min):
	.quad	0x3f3f3f3f3f3f3f3f
	.quad	0x3f3f3f3f3f3f3f3f
	.quad	0x3f3f3f3f3f3f3f3f
	.quad	0x3f3f3f3f3f3f3f3f
L(lcase_max):
	.quad	0x9999999999999999
	.quad	0x9999999999999999
	.quad	0x9999999999999999
	.quad	0x9999999999999999
L(case_add):
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.previous

	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
# endif
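
	/* Illustrative note (added): the check below is a cheap filter
	   for "either pointer might cross a page within the next
	   4x VEC loads".  Shifting (s1 | s2) left by 20 keeps only the
	   low 12 bits, i.e. the page offsets ORed together, so the
	   comparison is effectively:

	     ((s1 | s2) & 0xfff) > PAGE_SIZE - VEC_SIZE * 4  (= 3968)

	   Since the OR is at least as large as either offset, a real
	   page cross is never missed; false positives (both offsets
	   small but their OR large) are filtered out again at
	   L(page_cross).  */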
	movl	%edi, %eax
	orl	%esi, %eax
	sall	$20, %eax
	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
	ja	L(page_cross)

L(no_page_cross):
	/* Safe to compare 4x vectors.  */
	VMOVU	(%rdi), %ymm0
	/* 1s where s1 and s2 equal.  Just VPCMPEQ if it's not strcasecmp.
	   Otherwise converts ymm0 and the load from rsi to lowercase.
	   ymm2 is scratch and ymm1 is the return.  */
	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
	/* 1s at null CHAR.  */
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	/* 1s where s1 and s2 equal AND not null CHAR.  */
	vpandn	%ymm1, %ymm2, %ymm1

	/* All 1s -> keep going, any 0s -> return.  */
	vpmovmskb %ymm1, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	$VEC_SIZE, %rdx
	jbe	L(vec_0_test_len)
# endif

	/* All 1s represents all equals.  incl will overflow to zero in
	   the all-equals case.  Otherwise 1s will carry until the
	   position of the first mismatch.  */
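	/* Worked example (added for clarity): with ecx = 0b...11110111
	   (bytes 0-2 equal and non-null, byte 3 the first mismatch or
	   null), incl propagates the carry through the low run of 1s:
	   ecx + 1 = 0b...11111000, so tzcntl in L(return_vec_0) yields 3,
	   the index of the first difference.  If all 32 bytes were equal
	   and non-null, ecx = 0xffffffff wraps to 0 and we keep going.  */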
	incl	%ecx
	jz	L(more_3x_vec)

	.p2align 4,, 4
L(return_vec_0):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx), %edx
	je	L(ret0)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
# endif
L(ret0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 8
L(vec_0_test_len):
	notl	%ecx
	bzhil	%edx, %ecx, %eax
	jnz	L(return_vec_0)
	/* Align if it will cross fetch block.  */
	.p2align 4,, 2
L(ret_zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4,, 5
L(one_or_less):
#  ifdef USE_AS_STRCASECMP_L
	/* Set locale argument for strcasecmp.  */
	movq	%LOCALE_REG, %rdx
#  endif
	jb	L(ret_zero)
	/* 'nbe' covers the case where length is negative (large
	   unsigned).  */
	jnbe	OVERFLOW_STRCMP
#  ifdef USE_AS_WCSCMP
	movl	(%rdi), %edx
	xorl	%eax, %eax
	cmpl	(%rsi), %edx
	je	L(ret1)
	setl	%al
	negl	%eax
	orl	$1, %eax
#  else
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
#  endif
L(ret1):
	ret
# endif

	.p2align 4,, 10
L(return_vec_1):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_STRNCMP
	/* rdx must be > CHAR_PER_VEC so it is safe to subtract without
	   fear of overflow.  */
	addq	$-VEC_SIZE, %rdx
	cmpq	%rcx, %rdx
	jbe	L(ret_zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	VEC_SIZE(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_SIZE(%rsi, %rcx), %edx
	je	L(ret2)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
# endif
L(ret2):
	VZEROUPPER_RETURN

	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_3):
	salq	$32, %rcx
# endif

L(return_vec_2):
# ifndef USE_AS_STRNCMP
	tzcntl	%ecx, %ecx
# else
	tzcntq	%rcx, %rcx
	cmpq	%rcx, %rdx
	jbe	L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
	je	L(ret3)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
# endif
L(ret3):
	VZEROUPPER_RETURN

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_3):
	tzcntl	%ecx, %ecx
#  ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
	je	L(ret4)
	setl	%al
	negl	%eax
	orl	$1, %eax
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
#  endif
L(ret4):
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 10
L(more_3x_vec):
	/* Safe to compare 4x vectors.  */
	VMOVU	VEC_SIZE(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_1)

# ifdef USE_AS_STRNCMP
	subq	$(VEC_SIZE * 2), %rdx
	jbe	L(ret_zero)
# endif

	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_3)

# ifdef USE_AS_STRNCMP
	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	 */
	movl	$2, %r8d

# else
	xorl	%r8d, %r8d
# endif

	/* The prepare labels are various entry points from the page
	   cross logic.  */
L(prepare_loop):

# ifdef USE_AS_STRNCMP
	/* Store N + (VEC_SIZE * 4) and place the check at the beginning
	   of the loop.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
# endif
L(prepare_loop_no_len):

	/* Align s1 and adjust s2 accordingly.  */
	subq	%rdi, %rsi
	andq	$-(VEC_SIZE * 4), %rdi
	addq	%rdi, %rsi

# ifdef USE_AS_STRNCMP
	subq	%rdi, %rdx
# endif

L(prepare_loop_aligned):
	/* eax stores the distance from rsi to the next page cross.
	   These cases need to be handled specially as the 4x loop could
	   potentially read memory past the length of s1 or s2 and across
	   a page boundary.  */
	movl	$-(VEC_SIZE * 4), %eax
	subl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax

	/* Loop 4x comparisons at a time.  */
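	/* Added illustration of how one iteration folds four vectors into
	   a single test, for the plain byte-wise strcmp case, with AVX2
	   intrinsics (hypothetical helper, needs <immintrin.h>; the
	   pairing of the minimums differs from the asm but the result is
	   the same):

	     // Returns 1 iff all 128 bytes are equal and non-null, i.e.
	     // the loop may continue.  Assumes s1 is 32-byte aligned.
	     static inline int
	     loop_step_all_equal (const char *s1, const char *s2)
	     {
	       __m256i m = _mm256_set1_epi8 ((char) 0xff);
	       for (int i = 0; i < 4; i++)
	         {
	           __m256i v = _mm256_load_si256 ((const __m256i *) (s1 + 32 * i));
	           __m256i w = _mm256_loadu_si256 ((const __m256i *) (s2 + 32 * i));
	           // Non-zero byte iff s1/s2 match and the byte is not null.
	           __m256i e = _mm256_and_si256 (v, _mm256_cmpeq_epi8 (v, w));
	           // Unsigned byte minimum: any zero byte survives the fold.
	           m = _mm256_min_epu8 (m, e);
	         }
	       return _mm256_movemask_epi8
	         (_mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ())) == 0;
	     }

	   For wcscmp the same is done with dword minimum (vpminud).  */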
	.p2align 4
L(loop):

	/* End condition for strncmp.  */
# ifdef USE_AS_STRNCMP
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(ret_zero)
# endif

	subq	$-(VEC_SIZE * 4), %rdi
	subq	$-(VEC_SIZE * 4), %rsi

	/* Check if rsi loads will cross a page boundary.  */
	addl	$-(VEC_SIZE * 4), %eax
	jnb	L(page_cross_during_loop)

	/* Loop entry after handling page cross during loop.  */
L(loop_skip_page_cross_check):
	VMOVA	(VEC_SIZE * 0)(%rdi), %ymm0
	VMOVA	(VEC_SIZE * 1)(%rdi), %ymm2
	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6

	/* ymm1 all 1s where s1 and s2 equal.  All 0s otherwise.  */
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)

	/* Any mismatch or null CHAR yields a 0 CHAR; otherwise the
	   result is non-zero.  */
	vpand	%ymm0, %ymm1, %ymm1


	vpand	%ymm2, %ymm3, %ymm3
	vpand	%ymm4, %ymm5, %ymm5
	vpand	%ymm6, %ymm7, %ymm7

	VPMINU	%ymm1, %ymm3, %ymm3
	VPMINU	%ymm5, %ymm7, %ymm7

	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
	VPMINU	%ymm3, %ymm7, %ymm7

	/* If any 0 CHAR then done.  */
	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
	vpmovmskb %ymm7, %LOOP_REG
	testl	%LOOP_REG, %LOOP_REG
	jz	L(loop)

	/* Find which VEC has the mismatch or end of string.  */
	VPCMPEQ	%ymm1, %ymmZERO, %ymm1
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_0_end)


	VPCMPEQ	%ymm3, %ymmZERO, %ymm3
	vpmovmskb %ymm3, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_1_end)

L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
	subq	$(VEC_SIZE * 2), %rdx
	jbe	L(ret_zero_end)
# endif

	VPCMPEQ	%ymm5, %ymmZERO, %ymm5
	vpmovmskb %ymm5, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_2_end)

	/* LOOP_REG contains matches for null/mismatch from the loop.  If
	   VEC 0, 1, and 2 all have no null and no mismatches then the
	   mismatch must entirely be from VEC 3, which is fully
	   represented by LOOP_REG.  */
	tzcntl	%LOOP_REG, %LOOP_REG

# ifdef USE_AS_STRNCMP
	subl	$-(VEC_SIZE), %LOOP_REG
	cmpq	%LOOP_REG64, %rdx
	jbe	L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
	je	L(ret5)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret5):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif


	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
	   they use the value of `r8` to negate the return value.  This is
	   because the page cross logic can swap `rdi` and `rsi`.  */
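	/* Added note: for the byte-string cases this is the usual
	   branchless conditional negate.  With r8d either 0 (rdi/rsi not
	   swapped) or -1 (swapped by the page cross code):

	     // (x ^ 0) - 0     == x
	     // (x ^ -1) - (-1) == ~x + 1 == -x
	     ret = (ret ^ r8d) - r8d;

	   The wcscmp paths use the constants 2 / -4 with a plain xor
	   instead, which already yields a result of the correct sign for
	   the {-1, 0} comparison outcome without the extra subtract.  */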
	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_1_end):
	salq	$32, %rcx
# endif
L(return_vec_0_end):
# ifndef USE_AS_STRNCMP
	tzcntl	%ecx, %ecx
# else
	tzcntq	%rcx, %rcx
	cmpq	%rcx, %rdx
	jbe	L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx), %edx
	je	L(ret6)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret6):
	VZEROUPPER_RETURN

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_1_end):
	tzcntl	%ecx, %ecx
#  ifdef USE_AS_WCSCMP
	movl	VEC_SIZE(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_SIZE(%rsi, %rcx), %edx
	je	L(ret7)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
#  else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
#  endif
L(ret7):
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 10
L(return_vec_2_end):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %rdx
	jbe	L(ret_zero_page_cross)
# endif
# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
	je	L(ret11)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret11):
	VZEROUPPER_RETURN


	/* Page cross in rsi in next 4x VEC.  */

	/* TODO: Improve logic here.  */
	.p2align 4,, 10
L(page_cross_during_loop):
	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */

	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any logic here.  */
	cmpl	$-(VEC_SIZE * 4), %eax
	/* Don't adjust eax before jumping back to the loop; we will
	   never hit the page cross case again.  */
	je	L(loop_skip_page_cross_check)

	/* Check if we can safely load a VEC.  */
	cmpl	$-(VEC_SIZE * 3), %eax
	jle	L(less_1x_vec_till_page_cross)

	VMOVA	(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_0_end)

	/* If distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
	cmpl	$-(VEC_SIZE * 2), %eax
	jg	L(more_2x_vec_till_page_cross)

	.p2align 4,, 4
L(less_1x_vec_till_page_cross):
	subl	$-(VEC_SIZE * 4), %eax
	/* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
	   concerning case is the first iteration, if the incoming s1 was
	   near the start of a page and s2 near the end.  If s1 was near
	   the start of the page we already aligned to the nearest
	   VEC_SIZE * 4, so it is guaranteed safe to read back -VEC_SIZE.
	   If rdi is truly at the start of a page here, it means the
	   previous page (rdi - VEC_SIZE) has already been loaded earlier
	   so it must be valid.  */
	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx

	/* Mask of potentially valid bits.  The lower bits can be out of
	   range comparisons (but safe regarding page crosses).  */
	movl	$-1, %r10d
	shlxl	%esi, %r10d, %r10d
	notl	%ecx

# ifdef USE_AS_STRNCMP
	cmpq	%rax, %rdx
	jbe	L(return_page_cross_end_check)
# endif
	movl	%eax, %OFFSET_REG
	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax

	andl	%r10d, %ecx
	jz	L(loop_skip_page_cross_check)

	.p2align 4,, 3
L(return_page_cross_end):
	tzcntl	%ecx, %ecx

# ifdef USE_AS_STRNCMP
	leal	-VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
L(return_page_cross_cmp_mem):
# else
	addl	%OFFSET_REG, %ecx
# endif
# ifdef USE_AS_WCSCMP
	movl	VEC_OFFSET(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
	je	L(ret8)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret8):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_page_cross_end_check):
	andl	%r10d, %ecx
	tzcntl	%ecx, %ecx
	leal	-VEC_SIZE(%rax, %rcx), %ecx
	cmpl	%ecx, %edx
	ja	L(return_page_cross_cmp_mem)
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4,, 10
L(more_2x_vec_till_page_cross):
	/* If more than 2x VEC till page cross, we will complete a full
	   loop iteration here.  */

	VMOVU	VEC_SIZE(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_1_end)

# ifdef USE_AS_STRNCMP
	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(ret_zero_in_loop_page_cross)
# endif

	subl	$-(VEC_SIZE * 4), %eax

	/* Safe to include comparisons from lower bytes.  */
	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_page_cross_0)

	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx
	jnz	L(return_vec_page_cross_1)

# ifdef USE_AS_STRNCMP
	/* Must check the length here as it might preclude reading the
	   next page.  */
	cmpq	%rax, %rdx
	jbe	L(ret_zero_in_loop_page_cross)
# endif

	/* Finish the loop.  */
	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6

	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
	vpand	%ymm4, %ymm5, %ymm5
	vpand	%ymm6, %ymm7, %ymm7
	VPMINU	%ymm5, %ymm7, %ymm7
	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
	vpmovmskb %ymm7, %LOOP_REG
	testl	%LOOP_REG, %LOOP_REG
	jnz	L(return_vec_2_3_end)

	/* Best for code size to include an unconditional jmp here.  It
	   would be faster, if this case is hot, to duplicate the
	   L(return_vec_2_3_end) code as the fall-through and jump back
	   to the loop on the mismatch comparison.  */
	subq	$-(VEC_SIZE * 4), %rdi
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
# ifdef USE_AS_STRNCMP
	subq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_skip_page_cross_check)
L(ret_zero_in_loop_page_cross):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# else
	jmp	L(loop_skip_page_cross_check)
# endif


	.p2align 4,, 10
L(return_vec_page_cross_0):
	addl	$-VEC_SIZE, %eax
L(return_vec_page_cross_1):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_STRNCMP
	leal	-VEC_SIZE(%rax, %rcx), %ecx
	cmpq	%rcx, %rdx
	jbe	L(ret_zero_in_loop_page_cross)
# else
	addl	%eax, %ecx
# endif

# ifdef USE_AS_WCSCMP
	movl	VEC_OFFSET(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
	je	L(ret9)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret9):
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(page_cross):
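	/* Added overview: the entry filter only looked at the ORed page
	   offsets, so re-check each pointer individually here.  The
	   string whose page offset is larger is the one closer to its
	   page boundary and therefore limits how far ahead we may read;
	   if that is s2, the pointers are swapped and r8 is set so the
	   final return value gets negated.  Roughly:

	     // off1 = (uintptr_t) s1 & (PAGE_SIZE - 1);
	     // off2 = (uintptr_t) s2 & (PAGE_SIZE - 1);
	     // if (off2 > off1) -> L(page_cross_s2): swap, record in r8
	   */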
# ifndef USE_AS_STRNCMP
	/* If both are VEC aligned we don't need any special logic here.
	   Only valid for strcmp where the stop condition is guaranteed
	   to be reachable by just reading memory.  */
	testl	$((VEC_SIZE - 1) << 20), %eax
	jz	L(no_page_cross)
# endif

	movl	%edi, %eax
	movl	%esi, %ecx
	andl	$(PAGE_SIZE - 1), %eax
	andl	$(PAGE_SIZE - 1), %ecx

	xorl	%OFFSET_REG, %OFFSET_REG

	/* Check which is closer to page cross, s1 or s2.  */
	cmpl	%eax, %ecx
	jg	L(page_cross_s2)

	/* The previous page cross check has false positives.  Check for
	   a true positive, as the page cross logic is very expensive.  */
	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
	jbe	L(no_page_cross)

	/* Set r8 to not interfere with the normal return value (rdi and
	   rsi did not swap).  */
# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	 */
	movl	$2, %r8d
# else
	xorl	%r8d, %r8d
# endif

	/* Check if less than 1x VEC till page cross.  */
	subl	$(VEC_SIZE * 3), %eax
	jg	L(less_1x_vec_till_page)

	/* If more than 1x VEC till page cross, loop through safely
	   loadable memory until within 1x VEC of page cross.  */

	.p2align 4,, 10
L(page_cross_loop):

	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl	%ecx

	jnz	L(check_ret_vec_page_cross)
	addl	$VEC_SIZE, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross)
# endif
	addl	$VEC_SIZE, %eax
	jl	L(page_cross_loop)

	subl	%eax, %OFFSET_REG
	/* OFFSET_REG has the distance to page cross - VEC_SIZE.
	   Guaranteed to not cross the page so it is safe to load.  Since
	   we have already loaded at least 1 VEC from rsi it is also
	   guaranteed to be safe.  */

	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
	vpandn	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx

# ifdef USE_AS_STRNCMP
	leal	VEC_SIZE(%OFFSET_REG64), %eax
	cmpq	%rax, %rdx
	jbe	L(check_ret_vec_page_cross2)
	addq	%rdi, %rdx
# endif
	incl	%ecx
	jz	L(prepare_loop_no_len)

	.p2align 4,, 4
L(ret_vec_page_cross):
# ifndef USE_AS_STRNCMP
L(check_ret_vec_page_cross):
# endif
	tzcntl	%ecx, %ecx
	addl	%OFFSET_REG, %ecx
L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx), %edx
	je	L(ret12)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret12):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(check_ret_vec_page_cross2):
	incl	%ecx
L(check_ret_vec_page_cross):
	tzcntl	%ecx, %ecx
	addl	%OFFSET_REG, %ecx
	cmpq	%rcx, %rdx
	ja	L(ret_vec_page_cross_cont)
	.p2align 4,, 2
L(ret_zero_page_cross):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 4
L(page_cross_s2):
	/* Ensure this is a true page cross.  */
	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
	jbe	L(no_page_cross)


	movl	%ecx, %eax
	movq	%rdi, %rcx
	movq	%rsi, %rdi
	movq	%rcx, %rsi

	/* Set r8 to negate the return value, as rdi and rsi are
	   swapped.  */
# ifdef USE_AS_WCSCMP
	movl	$-4, %r8d
# else
	movl	$-1, %r8d
# endif
	xorl	%OFFSET_REG, %OFFSET_REG

	/* Check if more than 1x VEC till page cross.  */
	subl	$(VEC_SIZE * 3), %eax
	jle	L(page_cross_loop)
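
	/* Added note (derived from the arithmetic above, for the 4 KiB
	   PAGE_SIZE used here): on entry to L(less_1x_vec_till_page)
	   below, eax holds VEC_SIZE minus the number of bytes left
	   before the page boundary of the closer string (so
	   1 <= eax <= 31).  The checks against 16 / 24 / 28 therefore
	   pick the widest probe that still fits within the page:

	     // bytes_left = VEC_SIZE - eax
	     // bytes_left >= 16 -> 16-byte (xmm) probes
	     // bytes_left >=  8 -> 8-byte (vmovq) probes
	     // bytes_left >=  4 -> 4-byte (vmovd) probes
	     // otherwise        -> byte loop / single wchar check
	   */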
	.p2align 4,, 6
L(less_1x_vec_till_page):
	/* Find largest load size we can use.  */
	cmpl	$16, %eax
	ja	L(less_16_till_page)
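
	/* Added note: the probes below use xmm-width instructions, so
	   bits 16-31 of the vpmovmskb result are always 0 (the upper ymm
	   lane is zeroed), and for the 8- and 4-byte loads the remaining
	   bytes are 0 in both inputs and hence cleared by the null test.
	   Only the low 16 / 8 / 4 bits can ever be set, which is why
	   `incw %cx', `incb %cl' and `subl $0xf, %ecx' suffice to detect
	   the "all equal and non-null" case (the value wraps to zero),
	   while tzcnt can still recover the first zero bit when there is
	   one.  */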
	VMOVU	(%rdi), %xmm0
	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incw	%cx
	jnz	L(check_ret_vec_page_cross)
	movl	$16, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subl	%eax, %OFFSET_REG
# else
	/* Explicit check for 16 byte alignment.  */
	subl	%eax, %OFFSET_REG
	jz	L(prepare_loop)
# endif

	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incw	%cx
	jnz	L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl	$16, %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subq	$-(VEC_SIZE * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# else
	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
	jmp	L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case0):
	xorl	%eax, %eax
	ret
# endif


	.p2align 4,, 10
L(less_16_till_page):
	/* Find largest load size we can use.  */
	cmpl	$24, %eax
	ja	L(less_8_till_page)

	vmovq	(%rdi), %xmm0
	vmovq	(%rsi), %xmm1
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incb	%cl
	jnz	L(check_ret_vec_page_cross)


# ifdef USE_AS_STRNCMP
	cmpq	$8, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
# endif
	movl	$24, %OFFSET_REG
	/* Explicit check for 16 byte alignment.  */
	subl	%eax, %OFFSET_REG



	vmovq	(%rdi, %OFFSET_REG64), %xmm0
	vmovq	(%rsi, %OFFSET_REG64), %xmm1
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incb	%cl
	jnz	L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl	$8, %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subq	$-(VEC_SIZE * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# else
	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
	jmp	L(prepare_loop_aligned)


	.p2align 4,, 10
L(less_8_till_page):
# ifdef USE_AS_WCSCMP
	/* If using wchar then this is the only check before we reach
	   the page boundary.  */
	movl	(%rdi), %eax
	movl	(%rsi), %ecx
	cmpl	%ecx, %eax
	jnz	L(ret_less_8_wcs)
#  ifdef USE_AS_STRNCMP
	addq	%rdi, %rdx
	/* We already checked for len <= 1 so we cannot hit that case
	   here.  */
#  endif
	testl	%eax, %eax
	jnz	L(prepare_loop_no_len)
	ret

	.p2align 4,, 8
L(ret_less_8_wcs):
	setl	%OFFSET_REG8
	negl	%OFFSET_REG
	movl	%OFFSET_REG, %eax
	xorl	%r8d, %eax
	ret

# else

	/* Find largest load size we can use.  */
	cmpl	$28, %eax
	ja	L(less_4_till_page)

	vmovd	(%rdi), %xmm0
	vmovd	(%rsi), %xmm1
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	subl	$0xf, %ecx
	jnz	L(check_ret_vec_page_cross)

#  ifdef USE_AS_STRNCMP
	cmpq	$4, %rdx
	jbe	L(ret_zero_page_cross_slow_case1)
#  endif
	movl	$28, %OFFSET_REG
	/* Explicit check for 16 byte alignment.  */
	subl	%eax, %OFFSET_REG



	vmovd	(%rdi, %OFFSET_REG64), %xmm0
	vmovd	(%rsi, %OFFSET_REG64), %xmm1
	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn	%xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	subl	$0xf, %ecx
	jnz	L(check_ret_vec_page_cross)

#  ifdef USE_AS_STRNCMP
	addl	$4, %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case1)
	subq	$-(VEC_SIZE * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
#  else
	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
#  endif
	jmp	L(prepare_loop_aligned)

#  ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case1):
	xorl	%eax, %eax
	ret
#  endif

	.p2align 4,, 10
L(less_4_till_page):
	subq	%rdi, %rsi
	/* Extremely slow byte comparison loop.  */
L(less_4_loop):
	movzbl	(%rdi), %eax
	movzbl	(%rsi, %rdi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
	subl	%BYTE_LOOP_REG, %eax
	jnz	L(ret_less_4_loop)
	testl	%ecx, %ecx
	jz	L(ret_zero_4_loop)
#  ifdef USE_AS_STRNCMP
	decq	%rdx
	jz	L(ret_zero_4_loop)
#  endif
	incq	%rdi
	/* The end condition is reaching the page boundary (rdi is
	   aligned).  */
	testl	$31, %edi
	jnz	L(less_4_loop)
	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
	addq	$-(VEC_SIZE * 4), %rdi
#  ifdef USE_AS_STRNCMP
	subq	$-(VEC_SIZE * 4), %rdx
#  endif
	jmp	L(prepare_loop_aligned)

L(ret_zero_4_loop):
	xorl	%eax, %eax
	ret
L(ret_less_4_loop):
	xorl	%r8d, %eax
	subl	%r8d, %eax
	ret
# endif
	cfi_endproc
	.size	STRCMP, .-STRCMP
#endif