/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# define STRCMP_ISA _evex
# include "strcmp-naming.h"

# include <sysdep.h>
# if defined USE_AS_STRCASECMP_L
#  include "locale-defines.h"
# endif

# ifndef STRCMP
#  define STRCMP __strcmp_evex
# endif

# define PAGE_SIZE 4096

	/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE 32
# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR)

# define VMOVU vmovdqu64
# define VMOVA vmovdqa64

# ifdef USE_AS_WCSCMP
#  define TESTEQ subl $0xff,
	/* Compare packed dwords.  */
#  define VPCMP vpcmpd
#  define VPMINU vpminud
#  define VPTESTM vptestmd
#  define VPTESTNM vptestnmd
	/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR 4
# else
#  define TESTEQ incl
	/* Compare packed bytes.  */
#  define VPCMP vpcmpb
#  define VPMINU vpminub
#  define VPTESTM vptestmb
#  define VPTESTNM vptestnmb
	/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR 1
# endif

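	/* Illustrative note (not part of the build): TESTEQ, defined
	   above, turns the question "are all mask bits set?" into one
	   flag-setting instruction.  A minimal C sketch of the idea,
	   assuming a 32-bit mask for the byte case and an 8-bit mask for
	   the dword case:

	     static inline int all_lanes_equal_bytes (unsigned int k)
	     {
	       return k + 1 == 0;	// incl: 0xffffffff overflows to 0.
	     }

	     static inline int all_lanes_equal_dwords (unsigned int k)
	     {
	       return k - 0xff == 0;	// subl $0xff: only 8 mask bits in use.
	     }

	   Either way ZF is set exactly when every lane compared equal,
	   and on the not-all-equal path the carry/borrow leaves the first
	   cleared lane as the lowest set bit for tzcnt.  */
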
# ifdef USE_AS_STRNCMP
#  define LOOP_REG r9d
#  define LOOP_REG64 r9

#  define OFFSET_REG8 r9b
#  define OFFSET_REG r9d
#  define OFFSET_REG64 r9
# else
#  define LOOP_REG edx
#  define LOOP_REG64 rdx

#  define OFFSET_REG8 dl
#  define OFFSET_REG edx
#  define OFFSET_REG64 rdx
# endif

# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
#  define VEC_OFFSET 0
# else
#  define VEC_OFFSET (-VEC_SIZE)
# endif

# define XMM0 xmm17
# define XMM1 xmm18

# define XMM10 xmm27
# define XMM11 xmm28
# define XMM12 xmm29
# define XMM13 xmm30
# define XMM14 xmm31


# define YMM0 ymm17
# define YMM1 ymm18
# define YMM2 ymm19
# define YMM3 ymm20
# define YMM4 ymm21
# define YMM5 ymm22
# define YMM6 ymm23
# define YMM7 ymm24
# define YMM8 ymm25
# define YMM9 ymm26
# define YMM10 ymm27
# define YMM11 ymm28
# define YMM12 ymm29
# define YMM13 ymm30
# define YMM14 ymm31

# ifdef USE_AS_STRCASECMP_L
#  define BYTE_LOOP_REG OFFSET_REG
# else
#  define BYTE_LOOP_REG ecx
# endif

# ifdef USE_AS_STRCASECMP_L
#  ifdef USE_AS_STRNCMP
#   define LOCALE_REG rcx
#   define LOCALE_REG_LP RCX_LP
#  else
#   define LOCALE_REG rdx
#   define LOCALE_REG_LP RDX_LP
#  endif
# endif

# define LCASE_MIN_YMM %YMM12
# define LCASE_MAX_YMM %YMM13
# define CASE_ADD_YMM %YMM14

# define LCASE_MIN_XMM %XMM12
# define LCASE_MAX_XMM %XMM13
# define CASE_ADD_XMM %XMM14

	/* NB: wcsncmp uses r11 but strcasecmp is never used in
	   conjunction with wcscmp.  */
# define TOLOWER_BASE %r11

# ifdef USE_AS_STRCASECMP_L
#  define _REG(x, y) x ## y
#  define REG(x, y) _REG(x, y)
#  define TOLOWER(reg1, reg2, ext)					\
	vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);		\
	vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);		\
	vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;		\
	vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;		\
	vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5};			\
	vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}

#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
#  define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
#  define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)

#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)			\
	TOLOWER (s1_reg, s2_reg, ext);					\
	VPCMP $0, s1_reg, s2_reg, reg_out

#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)		\
	VMOVU s2_mem, s2_reg;						\
	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)

#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)

#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)

# else
#  define TOLOWER_gpr(...)
#  define TOLOWER_YMM(...)
#  define TOLOWER_XMM(...)

#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)			\
	VPCMP $0, s2_reg, s1_reg, reg_out

#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)

#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)		\
	VPCMP $0, s2_mem, s1_reg, reg_out

#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
# endif

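	/* Illustrative note (not part of the build): the TOLOWER macro
	   above lower-cases every byte of a vector in three data-parallel
	   steps, using the constants loaded below from L(lcase_min),
	   L(lcase_max) and L(case_add) (0x41, 0x1a and 0x20 per byte).  A
	   per-byte C sketch of the same transformation:

	     static inline unsigned char to_lower_ascii (unsigned char c)
	     {
	       // vpsubb LCASE_MIN: bias so that 'A'..'Z' maps to 0..25.
	       unsigned char biased = c - 0x41;
	       // vpcmpub $1, LCASE_MAX: unsigned "less than 26" selects uppercase.
	       if (biased < 0x1a)
	         // vpaddb CASE_ADD under mask: add 0x20 only to uppercase bytes.
	         c += 0x20;
	       return c;
	     }

	   The k5/k6 masks play the role of the `if` above, so the vector
	   path needs no branch.  */
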
/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   comparison is done on either packed bytes or dwords depending on
   USE_AS_WCSCMP.  In order to check the null CHAR, the algorithm keeps
   the matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and
   2 KORD).  In general, the costs of comparing VEC_SIZE bytes (32
   bytes) are 3 VPCMP and 2 KORD instructions, together with VMOVU and
   ktestd instructions.  The main loop (away from the page boundary)
   compares 4 vectors at a time, effectively comparing 4 x VEC_SIZE
   bytes (128 bytes) on each iteration.

   The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
   is the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero is
   returned.  */

	.section .text.evex, "ax", @progbits
	.align 16
	.type STRCMP, @function
	.globl STRCMP
# ifdef USE_AS_STRCASECMP_L
ENTRY (STRCASECMP)
	movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
	mov %fs:(%rax), %LOCALE_REG_LP

	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
	.p2align 4
END (STRCASECMP)
	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
# endif

	.p2align 4
STRCMP:
	cfi_startproc
	_CET_ENDBR
	CALL_MCOUNT

# if defined USE_AS_STRCASECMP_L
	/* We have to fall back on the C implementation for locales with
	   encodings not matching ASCII for single bytes.  */
#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
	mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
#  else
	mov (%LOCALE_REG), %RAX_LP
#  endif
	testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
	jne STRCASECMP_L_NONASCII
	leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
# endif

# ifdef USE_AS_STRNCMP
	/* Don't overwrite LOCALE_REG (rcx) until we have passed
	   L(one_or_less).  Otherwise we might use the wrong locale in
	   the OVERFLOW_STRCMP (strcasecmp_l).  */
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
#  endif
	cmp $1, %RDX_LP
	/* Signed comparison intentional.  We use this branch to also
	   test cases where length >= 2^63.  These very large sizes can be
	   handled with strcmp as there is no way for that length to
	   actually bound the buffer.  */
	jle L(one_or_less)
# endif

# if defined USE_AS_STRCASECMP_L
	.section .rodata.cst32, "aM", @progbits, 32
	.align 32
L(lcase_min):
	.quad 0x4141414141414141
	.quad 0x4141414141414141
	.quad 0x4141414141414141
	.quad 0x4141414141414141
L(lcase_max):
	.quad 0x1a1a1a1a1a1a1a1a
	.quad 0x1a1a1a1a1a1a1a1a
	.quad 0x1a1a1a1a1a1a1a1a
	.quad 0x1a1a1a1a1a1a1a1a
L(case_add):
	.quad 0x2020202020202020
	.quad 0x2020202020202020
	.quad 0x2020202020202020
	.quad 0x2020202020202020
	.previous

	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
# endif

	movl %edi, %eax
	orl %esi, %eax
	/* Shift out the bits irrelevant to page boundary ([63:12]).  */
	sall $20, %eax
	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
	cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
	ja L(page_cross)

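	/* Illustrative note (not part of the build): the entry check
	   above ORs the two pointers and shifts the page-offset bits
	   ([11:0]) into the top of the register, so one unsigned compare
	   tells us whether either string might come within
	   4 * VEC_SIZE bytes of a page end.  A minimal C sketch of the
	   same test (false positives allowed, they are filtered later in
	   L(page_cross); pointer values are passed as plain integers):

	     static inline int may_cross_page (unsigned long s1, unsigned long s2)
	     {
	       unsigned int off = (unsigned int) (s1 | s2);
	       off <<= 20;	// keep only bits [11:0], now in [31:20].
	       return off > ((4096u - 4u * 32u) << 20);
	     }

	   ORing the page offsets can only overestimate, so a "no" answer
	   is always safe to trust.  */
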
L(no_page_cross):
	/* Safe to compare 4x vectors.  */
	VMOVU (%rdi), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (%rsi).  */
	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
	kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
	cmpq $CHAR_PER_VEC, %rdx
	jbe L(vec_0_test_len)
# endif

	/* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
	   wcscmp/wcsncmp.  */

	/* All 1s represents all equals.  TESTEQ will overflow to zero in
	   all equals case.  Otherwise 1s will carry until position of first
	   mismatch.  */
	TESTEQ %ecx
	jz L(more_3x_vec)

	.p2align 4,, 4
L(return_vec_0):
	tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl %eax, %eax
	cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
	je L(ret0)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (%rdi, %rcx), %eax
	movzbl (%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret0):
	ret

# ifdef USE_AS_STRNCMP
	.p2align 4,, 4
L(vec_0_test_len):
	notl %ecx
	bzhil %edx, %ecx, %eax
	jnz L(return_vec_0)
	/* Align if will cross fetch block.  */
	.p2align 4,, 2
L(ret_zero):
	xorl %eax, %eax
	ret

	.p2align 4,, 5
L(one_or_less):
#  ifdef USE_AS_STRCASECMP_L
	/* Set locale argument for strcasecmp.  */
	movq %LOCALE_REG, %rdx
#  endif
	jb L(ret_zero)
	/* 'nbe' covers the case where length is negative (large
	   unsigned).  */
	jnbe OVERFLOW_STRCMP
#  ifdef USE_AS_WCSCMP
	movl (%rdi), %edx
	xorl %eax, %eax
	cmpl (%rsi), %edx
	je L(ret1)
	setl %al
	negl %eax
	orl $1, %eax
#  else
	movzbl (%rdi), %eax
	movzbl (%rsi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
#  endif
L(ret1):
	ret
# endif

	.p2align 4,, 10
L(return_vec_1):
	tzcntl %ecx, %ecx
# ifdef USE_AS_STRNCMP
	/* rdx must be > CHAR_PER_VEC so it's safe to subtract without
	   worrying about underflow.  */
	addq $-CHAR_PER_VEC, %rdx
	cmpq %rcx, %rdx
	jbe L(ret_zero)
# endif
# ifdef USE_AS_WCSCMP
	movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl %eax, %eax
	cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je L(ret2)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl VEC_SIZE(%rdi, %rcx), %eax
	movzbl VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret2):
	ret

	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_3):
#  if CHAR_PER_VEC <= 16
	sall $CHAR_PER_VEC, %ecx
#  else
	salq $CHAR_PER_VEC, %rcx
#  endif
# endif
L(return_vec_2):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
	tzcntl %ecx, %ecx
# else
	tzcntq %rcx, %rcx
# endif

# ifdef USE_AS_STRNCMP
	cmpq %rcx, %rdx
	jbe L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je L(ret3)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret3):
	ret

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_3):
	tzcntl %ecx, %ecx
#  ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je L(ret4)
	setl %al
	negl %eax
	orl $1, %eax
#  else
	movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
#  endif
L(ret4):
	ret
# endif

	/* 32 byte align here ensures the main loop is ideally aligned
	   for DSB.  */
	.p2align 5
L(more_3x_vec):
	/* Safe to compare 4x vectors.  */
	VMOVU (VEC_SIZE)(%rdi), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
	kmovd %k1, %ecx
	TESTEQ %ecx
	jnz L(return_vec_1)

# ifdef USE_AS_STRNCMP
	subq $(CHAR_PER_VEC * 2), %rdx
	jbe L(ret_zero)
# endif

	VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
	kmovd %k1, %ecx
	TESTEQ %ecx
	jnz L(return_vec_2)

	VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
	kmovd %k1, %ecx
	TESTEQ %ecx
	jnz L(return_vec_3)

# ifdef USE_AS_STRNCMP
	cmpq $(CHAR_PER_VEC * 2), %rdx
	jbe L(ret_zero)
# endif

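	/* Illustrative note (not part of the build): every block above
	   uses the same masked-compare pattern: VPTESTM builds k2 with a
	   bit per lane that is non-null in s1, the {%k2}-masked VPCMP
	   sets a k1 bit only for lanes that are both non-null and equal,
	   and TESTEQ asks whether the whole mask is set.  A per-lane C
	   sketch of what one k1 bit means (byte lanes shown, dwords for
	   wcscmp):

	     static inline int lane_ok (unsigned char c1, unsigned char c2)
	     {
	       // Bit is set iff s1's lane is not NUL and the lanes match.
	       return c1 != 0 && c1 == c2;
	     }

	   A cleared bit therefore marks either the first mismatch or the
	   terminating null, and TESTEQ leaves that lane's index as the
	   lowest set bit for the tzcnt in the return paths.  */
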
# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	 */
	movl $2, %r8d

# else
	xorl %r8d, %r8d
# endif

	/* The prepare labels are various entry points from the page
	   cross logic.  */
L(prepare_loop):

# ifdef USE_AS_STRNCMP
#  ifdef USE_AS_WCSCMP
L(prepare_loop_no_len):
	movl %edi, %ecx
	andl $(VEC_SIZE * 4 - 1), %ecx
	shrl $2, %ecx
	leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
#  else
	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
	   the loop.  */
	leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
L(prepare_loop_no_len):
#  endif
# else
L(prepare_loop_no_len):
# endif

	/* Align s1 and adjust s2 accordingly.  */
	subq %rdi, %rsi
	andq $-(VEC_SIZE * 4), %rdi
L(prepare_loop_readj):
	addq %rdi, %rsi
# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
	subq %rdi, %rdx
# endif

L(prepare_loop_aligned):
	/* eax stores distance from rsi to next page cross.  These cases
	   need to be handled specially as the 4x loop could potentially
	   read memory past the length of s1 or s2 and across a page
	   boundary.  */
	movl $-(VEC_SIZE * 4), %eax
	subl %esi, %eax
	andl $(PAGE_SIZE - 1), %eax


	/* Loop 4x comparisons at a time.  */
	.p2align 4
L(loop):

	/* End condition for strncmp.  */
# ifdef USE_AS_STRNCMP
	subq $(CHAR_PER_VEC * 4), %rdx
	jbe L(ret_zero)
# endif

	subq $-(VEC_SIZE * 4), %rdi
	subq $-(VEC_SIZE * 4), %rsi

	/* Check if rsi loads will cross a page boundary.  */
	addl $-(VEC_SIZE * 4), %eax
	jnb L(page_cross_during_loop)

	/* Loop entry after handling page cross during loop.  */
L(loop_skip_page_cross_check):
	VMOVA (VEC_SIZE * 0)(%rdi), %YMM0
	VMOVA (VEC_SIZE * 1)(%rdi), %YMM2
	VMOVA (VEC_SIZE * 2)(%rdi), %YMM4
	VMOVA (VEC_SIZE * 3)(%rdi), %YMM6

	VPMINU %YMM0, %YMM2, %YMM8
	VPMINU %YMM4, %YMM6, %YMM9

	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
	VPMINU %YMM8, %YMM9, %YMM9

	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
	VPTESTM %YMM9, %YMM9, %k1
# ifndef USE_AS_STRCASECMP_L
	vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
	vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
	vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
	   oring with YMM1.  Result is stored in YMM6.  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
# else
	VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
	TOLOWER_YMM (%YMM0, %YMM1)
	VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
	TOLOWER_YMM (%YMM2, %YMM3)
	VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
	TOLOWER_YMM (%YMM4, %YMM5)
	VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
	TOLOWER_YMM (%YMM6, %YMM7)
	vpxorq %YMM0, %YMM1, %YMM1
	vpxorq %YMM2, %YMM3, %YMM3
	vpxorq %YMM4, %YMM5, %YMM5
	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
# endif
	/* Or together YMM3, YMM5, and YMM6.  */
	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6


	/* A non-zero CHAR in YMM6 represents a mismatch.  */
	VPTESTNM %YMM6, %YMM6, %k0{%k1}
	kmovd %k0, %LOOP_REG

	TESTEQ %LOOP_REG
	jz L(loop)

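	/* Illustrative note (not part of the build): the two vpternlogd
	   constants above are simply 3-input truth tables.  With A = the
	   destination, B = second source and C = third source, imm 0xde
	   encodes (A ^ C) | B and imm 0xfe encodes A | B | C, so one
	   instruction fuses the XOR mismatch test with the OR that
	   accumulates results.  A scalar C sketch of what the loop
	   computes per lane position:

	     static inline unsigned int loop_lane (unsigned int a0, unsigned int b0,
	                                           unsigned int a1, unsigned int b1,
	                                           unsigned int a2, unsigned int b2,
	                                           unsigned int a3, unsigned int b3)
	     {
	       // Non-zero iff any of the four lane pairs differs.
	       return (a0 ^ b0) | (a1 ^ b1) | (a2 ^ b2) | (a3 ^ b3);
	     }

	   The VPMINU chain plus the k1-masked VPTESTNM then flags a lane
	   if it differs or if a null CHAR has been reached.  */
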
	/* Find which VEC has the mismatch or end of string.  */
	VPTESTM %YMM0, %YMM0, %k1
	VPTESTNM %YMM1, %YMM1, %k0{%k1}
	kmovd %k0, %ecx
	TESTEQ %ecx
	jnz L(return_vec_0_end)

	VPTESTM %YMM2, %YMM2, %k1
	VPTESTNM %YMM3, %YMM3, %k0{%k1}
	kmovd %k0, %ecx
	TESTEQ %ecx
	jnz L(return_vec_1_end)


	/* Handle VEC 2 and 3 without branches.  */
L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
	subq $(CHAR_PER_VEC * 2), %rdx
	jbe L(ret_zero_end)
# endif

	VPTESTM %YMM4, %YMM4, %k1
	VPTESTNM %YMM5, %YMM5, %k0{%k1}
	kmovd %k0, %ecx
	TESTEQ %ecx
# if CHAR_PER_VEC <= 16
	sall $CHAR_PER_VEC, %LOOP_REG
	orl %ecx, %LOOP_REG
# else
	salq $CHAR_PER_VEC, %LOOP_REG64
	orq %rcx, %LOOP_REG64
# endif
L(return_vec_3_end):
	/* LOOP_REG contains matches for null/mismatch from the loop.  If
	   VEC 0, 1, and 2 all have no null and no mismatches then the
	   mismatch must entirely be from VEC 3 which is fully represented
	   by LOOP_REG.  */
# if CHAR_PER_VEC <= 16
	tzcntl %LOOP_REG, %LOOP_REG
# else
	tzcntq %LOOP_REG64, %LOOP_REG64
# endif
# ifdef USE_AS_STRNCMP
	cmpq %LOOP_REG64, %rdx
	jbe L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
	je L(ret5)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret5):
	ret

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_end):
	xorl %eax, %eax
	ret
# endif


	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
	   they use the value of `r8` to negate the return value.  This is
	   because the page cross logic can swap `rdi` and `rsi`.  */
	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_1_end):
#  if CHAR_PER_VEC <= 16
	sall $CHAR_PER_VEC, %ecx
#  else
	salq $CHAR_PER_VEC, %rcx
#  endif
# endif
L(return_vec_0_end):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
	tzcntl %ecx, %ecx
# else
	tzcntq %rcx, %rcx
# endif

# ifdef USE_AS_STRNCMP
	cmpq %rcx, %rdx
	jbe L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl %eax, %eax
	cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
	je L(ret6)
	setl %al
	negl %eax
	/* This is the non-zero case for `eax` so just xorl with `r8d` to
	   flip the sign if `rdi` and `rsi` were swapped.  */
	xorl %r8d, %eax
# else
	movzbl (%rdi, %rcx), %eax
	movzbl (%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
	   logic.  Subtract `r8d` after xor for zero case.  */
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret6):
	ret

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_1_end):
	tzcntl %ecx, %ecx
#  ifdef USE_AS_WCSCMP
	movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl %eax, %eax
	cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je L(ret7)
	setl %al
	negl %eax
	xorl %r8d, %eax
#  else
	movzbl VEC_SIZE(%rdi, %rcx), %eax
	movzbl VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
#  endif
L(ret7):
	ret
# endif

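	/* Illustrative note (not part of the build): the xorl/subl pair
	   in the return paths above is a branchless conditional negation.
	   The page cross code sets r8d to 0 when rdi/rsi are in their
	   original order and to -1 when it had to swap them, so the same
	   return path works for both orders.  A C sketch of the byte
	   path:

	     static inline int fixup_result (int diff, int r8)
	     {
	       // r8 == 0: returns diff unchanged.  r8 == -1: returns -diff,
	       // because (diff ^ -1) - (-1) is two's-complement negation.
	       return (diff ^ r8) - r8;
	     }

	   The wcscmp path only needs the xor: its diff is limited to
	   {-1, 0} before the fixup, and r8d is 2 or -4 so that the xor
	   alone yields a value with the correct sign.  */
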
	/* Page cross in rsi in next 4x VEC.  */

	/* TODO: Improve logic here.  */
	.p2align 4,, 10
L(page_cross_during_loop):
	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */

	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any logic here.  */
	cmpl $-(VEC_SIZE * 4), %eax
	/* Don't adjust eax before jumping back to loop and we will
	   never hit page cross case again.  */
	je L(loop_skip_page_cross_check)

	/* Check if we can safely load a VEC.  */
	cmpl $-(VEC_SIZE * 3), %eax
	jle L(less_1x_vec_till_page_cross)

	VMOVA (%rdi), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
	kmovd %k1, %ecx
	TESTEQ %ecx
	jnz L(return_vec_0_end)

	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
	cmpl $-(VEC_SIZE * 2), %eax
	jg L(more_2x_vec_till_page_cross)

	.p2align 4,, 4
L(less_1x_vec_till_page_cross):
	subl $-(VEC_SIZE * 4), %eax
	/* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
	   concerning case is first iteration if incoming s1 was near start
	   of a page and s2 near end.  If s1 was near the start of the page
	   we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe
	   to read back -VEC_SIZE.  If rdi is truly at the start of a page
	   here, it means the previous page (rdi - VEC_SIZE) has already
	   been loaded earlier so must be valid.  */
	VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
	/* Mask of potentially valid bits.  The lower bits can be out of
	   range comparisons (but safe regarding page crosses).  */

# ifdef USE_AS_WCSCMP
	movl $-1, %r10d
	movl %esi, %ecx
	andl $(VEC_SIZE - 1), %ecx
	shrl $2, %ecx
	shlxl %ecx, %r10d, %ecx
	movzbl %cl, %r10d
# else
	movl $-1, %ecx
	shlxl %esi, %ecx, %r10d
# endif

	kmovd %k1, %ecx
	notl %ecx


# ifdef USE_AS_STRNCMP
#  ifdef USE_AS_WCSCMP
	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
	   safe.  */
	movl %eax, %r11d
	shrl $2, %r11d
	cmpq %r11, %rdx
#  else
	cmpq %rax, %rdx
#  endif
	jbe L(return_page_cross_end_check)
# endif
	movl %eax, %OFFSET_REG

	/* Readjust eax before potentially returning to the loop.  */
	addl $(PAGE_SIZE - VEC_SIZE * 4), %eax

	andl %r10d, %ecx
	jz L(loop_skip_page_cross_check)

	.p2align 4,, 3
L(return_page_cross_end):
	tzcntl %ecx, %ecx

# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
	leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
L(return_page_cross_cmp_mem):
# else
	addl %OFFSET_REG, %ecx
# endif
# ifdef USE_AS_WCSCMP
	movl VEC_OFFSET(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl VEC_OFFSET(%rsi, %rcx), %edx
	je L(ret8)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl VEC_OFFSET(%rdi, %rcx), %eax
	movzbl VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret8):
	ret

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_page_cross_end_check):
	andl %r10d, %ecx
	tzcntl %ecx, %ecx
	leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
#  ifdef USE_AS_WCSCMP
	sall $2, %edx
#  endif
	cmpl %ecx, %edx
	ja L(return_page_cross_cmp_mem)
	xorl %eax, %eax
	ret
# endif


	.p2align 4,, 10
L(more_2x_vec_till_page_cross):
	/* If more than 2x VEC till page cross we will complete a full
	   loop iteration here.  */

	VMOVA VEC_SIZE(%rdi), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
	kmovd %k1, %ecx
	TESTEQ %ecx
	jnz L(return_vec_1_end)

# ifdef USE_AS_STRNCMP
	cmpq $(CHAR_PER_VEC * 2), %rdx
	jbe L(ret_zero_in_loop_page_cross)
# endif

	subl $-(VEC_SIZE * 4), %eax

	/* Safe to include comparisons from lower bytes.  */
	VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
	kmovd %k1, %ecx
	TESTEQ %ecx
	jnz L(return_vec_page_cross_0)

	VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
	kmovd %k1, %ecx
	TESTEQ %ecx
	jnz L(return_vec_page_cross_1)

# ifdef USE_AS_STRNCMP
	/* Must check length here as length might preclude reading the
	   next page.  */
#  ifdef USE_AS_WCSCMP
	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
	   safe.  */
	movl %eax, %r11d
	shrl $2, %r11d
	cmpq %r11, %rdx
#  else
	cmpq %rax, %rdx
#  endif
	jbe L(ret_zero_in_loop_page_cross)
# endif

	/* Finish the loop.  */
	VMOVA (VEC_SIZE * 2)(%rdi), %YMM4
	VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
	VPMINU %YMM4, %YMM6, %YMM9
	VPTESTM %YMM9, %YMM9, %k1
# ifndef USE_AS_STRCASECMP_L
	vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
# else
	VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
	TOLOWER_YMM (%YMM4, %YMM5)
	VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
	TOLOWER_YMM (%YMM6, %YMM7)
	vpxorq %YMM4, %YMM5, %YMM5
	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
# endif
	VPTESTNM %YMM6, %YMM6, %k0{%k1}
	kmovd %k0, %LOOP_REG
	TESTEQ %LOOP_REG
	jnz L(return_vec_2_3_end)

	/* Best for code size to include an unconditional jmp here.  If
	   this case is hot it would be faster to duplicate the
	   L(return_vec_2_3_end) code as fall-through and jump back to the
	   loop on mismatch comparison.  */
	subq $-(VEC_SIZE * 4), %rdi
	subq $-(VEC_SIZE * 4), %rsi
	addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
# ifdef USE_AS_STRNCMP
	subq $(CHAR_PER_VEC * 4), %rdx
	ja L(loop_skip_page_cross_check)
L(ret_zero_in_loop_page_cross):
	xorl %eax, %eax
	ret
# else
	jmp L(loop_skip_page_cross_check)
# endif


	.p2align 4,, 10
L(return_vec_page_cross_0):
	addl $-VEC_SIZE, %eax
L(return_vec_page_cross_1):
	tzcntl %ecx, %ecx
# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
	leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
#  ifdef USE_AS_STRNCMP
#   ifdef USE_AS_WCSCMP
	/* Must divide ecx instead of multiply rdx due to overflow.  */
	movl %ecx, %eax
	shrl $2, %eax
	cmpq %rax, %rdx
#   else
	cmpq %rcx, %rdx
#   endif
	jbe L(ret_zero_in_loop_page_cross)
#  endif
# else
	addl %eax, %ecx
# endif

# ifdef USE_AS_WCSCMP
	movl VEC_OFFSET(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl VEC_OFFSET(%rsi, %rcx), %edx
	je L(ret9)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl VEC_OFFSET(%rdi, %rcx), %eax
	movzbl VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret9):
	ret


	.p2align 4,, 10
L(page_cross):
# ifndef USE_AS_STRNCMP
	/* If both are VEC aligned we don't need any special logic here.
	   Only valid for strcmp where stop condition is guaranteed to be
	   reachable by just reading memory.  */
	testl $((VEC_SIZE - 1) << 20), %eax
	jz L(no_page_cross)
# endif

	movl %edi, %eax
	movl %esi, %ecx
	andl $(PAGE_SIZE - 1), %eax
	andl $(PAGE_SIZE - 1), %ecx

	xorl %OFFSET_REG, %OFFSET_REG

	/* Check which is closer to page cross, s1 or s2.  */
	cmpl %eax, %ecx
	jg L(page_cross_s2)

	/* The previous page cross check has false positives.  Check for
	   true positive as page cross logic is very expensive.  */
	subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
	jbe L(no_page_cross)


	/* Set r8 to not interfere with normal return value (rdi and rsi
	   did not swap).  */
# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	 */
	movl $2, %r8d
# else
	xorl %r8d, %r8d
# endif

	/* Check if less than 1x VEC till page cross.  */
	subl $(VEC_SIZE * 3), %eax
	jg L(less_1x_vec_till_page)


	/* If more than 1x VEC till page cross, loop through safely
	   loadable memory until within 1x VEC of page cross.  */
	.p2align 4,, 8
L(page_cross_loop):
	VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
	kmovd %k1, %ecx
	TESTEQ %ecx
	jnz L(check_ret_vec_page_cross)
	addl $CHAR_PER_VEC, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross)
# endif
	addl $VEC_SIZE, %eax
	jl L(page_cross_loop)

# ifdef USE_AS_WCSCMP
	shrl $2, %eax
# endif

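	/* Illustrative note (not part of the build): when the loop above
	   stops, eax holds the number of chars by which the next full-VEC
	   load would spill onto the following page.  The subtraction just
	   below backs OFFSET_REG up by exactly that amount, so the final
	   unaligned load ends flush with the page boundary and merely
	   re-compares a few already-checked characters.  A C sketch of
	   the offset computation, assuming byte-sized chars:

	     static inline unsigned int last_safe_offset (unsigned int page_off,
	                                                  unsigned int done)
	     {
	       // done = characters already compared (a multiple of VEC_SIZE).
	       // overshoot = how far past (page_end - VEC_SIZE) we have run.
	       unsigned int overshoot = page_off + 32 + done - 4096;
	       return done - overshoot;	// VEC_SIZE load ends at the page end.
	     }

	   Re-reading the overlap is safe because those characters already
	   compared equal and non-null, so any cleared mask bit must lie
	   past them.  */
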
	subl %eax, %OFFSET_REG
	/* OFFSET_REG has distance to page cross - VEC_SIZE.  Guaranteed
	   to not cross page so is safe to load.  Since we have already
	   loaded at least 1 VEC from rsi it is also guaranteed to be
	   safe.  */
	VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
	VPTESTM %YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}

	kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
	leal CHAR_PER_VEC(%OFFSET_REG64), %eax
	cmpq %rax, %rdx
	jbe L(check_ret_vec_page_cross2)
#  ifdef USE_AS_WCSCMP
	addq $-(CHAR_PER_VEC * 2), %rdx
#  else
	addq %rdi, %rdx
#  endif
# endif
	TESTEQ %ecx
	jz L(prepare_loop_no_len)

	.p2align 4,, 4
L(ret_vec_page_cross):
# ifndef USE_AS_STRNCMP
L(check_ret_vec_page_cross):
# endif
	tzcntl %ecx, %ecx
	addl %OFFSET_REG, %ecx
L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl %eax, %eax
	cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
	je L(ret12)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
	movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret12):
	ret


# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(check_ret_vec_page_cross2):
	TESTEQ %ecx
L(check_ret_vec_page_cross):
	tzcntl %ecx, %ecx
	addl %OFFSET_REG, %ecx
	cmpq %rcx, %rdx
	ja L(ret_vec_page_cross_cont)
	.p2align 4,, 2
L(ret_zero_page_cross):
	xorl %eax, %eax
	ret
# endif

	.p2align 4,, 4
L(page_cross_s2):
	/* Ensure this is a true page cross.  */
	subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
	jbe L(no_page_cross)


	movl %ecx, %eax
	movq %rdi, %rcx
	movq %rsi, %rdi
	movq %rcx, %rsi

	/* Set r8 to negate return value as rdi and rsi swapped.  */
# ifdef USE_AS_WCSCMP
	movl $-4, %r8d
# else
	movl $-1, %r8d
# endif
	xorl %OFFSET_REG, %OFFSET_REG

	/* Check if more than 1x VEC till page cross.  */
	subl $(VEC_SIZE * 3), %eax
	jle L(page_cross_loop)

	.p2align 4,, 6
L(less_1x_vec_till_page):
# ifdef USE_AS_WCSCMP
	shrl $2, %eax
# endif
	/* Find largest load size we can use.  */
	cmpl $(16 / SIZE_OF_CHAR), %eax
	ja L(less_16_till_page)

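	/* Illustrative note (not part of the build): from here on eax is
	   32 minus the number of bytes left before the page boundary (in
	   chars for wcscmp), so the chain of compares below simply picks
	   the widest probe that still fits on this page.  A C sketch of
	   the decision, byte-sized chars assumed:

	     static inline int probe_width (unsigned int dist_to_page_end)
	     {
	       if (dist_to_page_end >= 16)
	         return 16;	// xmm probe (fall-through below).
	       if (dist_to_page_end >= 8)
	         return 8;	// vmovq path in L(less_16_till_page).
	       if (dist_to_page_end >= 4)
	         return 4;	// vmovd path in L(less_8_till_page).
	       return 1;	// byte loop in L(less_4_till_page).
	     }

	   Each probe is then repeated once more at an offset chosen so
	   that the second load ends exactly at the page boundary.  */
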
	/* Use 16 byte comparison.  */
	vmovdqu (%rdi), %xmm0
	VPTESTM %xmm0, %xmm0, %k2
	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
	kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
	subl $0xf, %ecx
# else
	incw %cx
# endif
	jnz L(check_ret_vec_page_cross)
	movl $(16 / SIZE_OF_CHAR), %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
	subl %eax, %OFFSET_REG
# else
	/* Explicit check for 16 byte alignment.  */
	subl %eax, %OFFSET_REG
	jz L(prepare_loop)
# endif
	vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
	VPTESTM %xmm0, %xmm0, %k2
	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
	kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
	subl $0xf, %ecx
# else
	incw %cx
# endif
	jnz L(check_ret_vec_page_cross)
# ifdef USE_AS_STRNCMP
	addl $(16 / SIZE_OF_CHAR), %OFFSET_REG
	subq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
	subq $-(CHAR_PER_VEC * 4), %rdx

	leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
	leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
	jmp L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case0):
	xorl %eax, %eax
	ret
# endif


	.p2align 4,, 10
L(less_16_till_page):
	cmpl $(24 / SIZE_OF_CHAR), %eax
	ja L(less_8_till_page)

	/* Use 8 byte comparison.  */
	vmovq (%rdi), %xmm0
	vmovq (%rsi), %xmm1
	VPTESTM %xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
	subl $0x3, %ecx
# else
	incb %cl
# endif
	jnz L(check_ret_vec_page_cross)


# ifdef USE_AS_STRNCMP
	cmpq $(8 / SIZE_OF_CHAR), %rdx
	jbe L(ret_zero_page_cross_slow_case0)
# endif
	movl $(24 / SIZE_OF_CHAR), %OFFSET_REG
	subl %eax, %OFFSET_REG

	vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
	vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
	VPTESTM %xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
	subl $0x3, %ecx
# else
	incb %cl
# endif
	jnz L(check_ret_vec_page_cross)


# ifdef USE_AS_STRNCMP
	addl $(8 / SIZE_OF_CHAR), %OFFSET_REG
	subq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
	subq $-(CHAR_PER_VEC * 4), %rdx

	leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
	leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
	jmp L(prepare_loop_aligned)




	.p2align 4,, 10
L(less_8_till_page):
# ifdef USE_AS_WCSCMP
	/* If using wchar then this is the only check before we reach
	   the page boundary.  */
	movl (%rdi), %eax
	movl (%rsi), %ecx
	cmpl %ecx, %eax
	jnz L(ret_less_8_wcs)
#  ifdef USE_AS_STRNCMP
	addq $-(CHAR_PER_VEC * 2), %rdx
	/* We already checked for len <= 1 so cannot hit that case here.  */
#  endif
	testl %eax, %eax
	jnz L(prepare_loop)
	ret

	.p2align 4,, 8
L(ret_less_8_wcs):
	setl %OFFSET_REG8
	negl %OFFSET_REG
	movl %OFFSET_REG, %eax
	xorl %r8d, %eax
	ret

# else
	cmpl $28, %eax
	ja L(less_4_till_page)

	vmovd (%rdi), %xmm0
	vmovd (%rsi), %xmm1
	VPTESTM %xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd %k1, %ecx
	subl $0xf, %ecx
	jnz L(check_ret_vec_page_cross)

#  ifdef USE_AS_STRNCMP
	cmpq $4, %rdx
	jbe L(ret_zero_page_cross_slow_case1)
#  endif
	movl $(28 / SIZE_OF_CHAR), %OFFSET_REG
	subl %eax, %OFFSET_REG

	vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
	vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
	VPTESTM %xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd %k1, %ecx
	subl $0xf, %ecx
	jnz L(check_ret_vec_page_cross)
#  ifdef USE_AS_STRNCMP
	addl $(4 / SIZE_OF_CHAR), %OFFSET_REG
	subq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case1)
	subq $-(CHAR_PER_VEC * 4), %rdx

	leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
#  else
	leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
#  endif
	jmp L(prepare_loop_aligned)


#  ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case1):
	xorl %eax, %eax
	ret
#  endif

	.p2align 4,, 10
L(less_4_till_page):
	subq %rdi, %rsi
	/* Extremely slow byte comparison loop.  */
L(less_4_loop):
	movzbl (%rdi), %eax
	movzbl (%rsi, %rdi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
	subl %BYTE_LOOP_REG, %eax
	jnz L(ret_less_4_loop)
	testl %ecx, %ecx
	jz L(ret_zero_4_loop)
#  ifdef USE_AS_STRNCMP
	decq %rdx
	jz L(ret_zero_4_loop)
#  endif
	incq %rdi
	/* End condition is reaching a page boundary (rdi is aligned).  */
	testl $31, %edi
	jnz L(less_4_loop)
	leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
	addq $-(VEC_SIZE * 4), %rdi
#  ifdef USE_AS_STRNCMP
	subq $-(CHAR_PER_VEC * 4), %rdx
#  endif
	jmp L(prepare_loop_aligned)

L(ret_zero_4_loop):
	xorl %eax, %eax
	ret
L(ret_less_4_loop):
	xorl %r8d, %eax
	subl %r8d, %eax
	ret
# endif
	cfi_endproc
	.size STRCMP, .-STRCMP
#endif