1/* strcmp optimized with SSE2. 2 Copyright (C) 2017-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <isa-level.h> 20 21/* Continue building at ISA level 2 as the strcmp-sse42 is not always 22 preferable for ISA level == 2 CPUs. */ 23#if ISA_SHOULD_BUILD (2) 24 25# define STRCMP_ISA _sse2 26# include "strcmp-naming.h" 27 28# include <sysdep.h> 29 30# undef UPDATE_STRNCMP_COUNTER 31 32# ifndef LABEL 33# define LABEL(l) L(l) 34# endif 35 36# ifdef USE_AS_STRNCMP 37/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz 38 if the new counter > the old one or is 0. 
*/ 39# define UPDATE_STRNCMP_COUNTER \ 40 /* calculate left number to compare */ \ 41 lea -16(%rcx, %r11), %r9; \ 42 cmp %r9, %r11; \ 43 jb LABEL(strcmp_exitz); \ 44 test %r9, %r9; \ 45 je LABEL(strcmp_exitz); \ 46 mov %r9, %r11 47 48# elif defined USE_AS_STRCASECMP_L 49# include "locale-defines.h" 50 51# define UPDATE_STRNCMP_COUNTER 52# elif defined USE_AS_STRNCASECMP_L 53# include "locale-defines.h" 54 55# define UPDATE_STRNCMP_COUNTER \ 56 /* calculate left number to compare */ \ 57 lea -16(%rcx, %r11), %r9; \ 58 cmp %r9, %r11; \ 59 jb LABEL(strcmp_exitz); \ 60 test %r9, %r9; \ 61 je LABEL(strcmp_exitz); \ 62 mov %r9, %r11 63# else 64# define UPDATE_STRNCMP_COUNTER 65# endif 66 67 .text 68# ifdef USE_AS_STRCASECMP_L 69# ifndef ENTRY2 70# define ENTRY2(name) ENTRY (name) 71# define END2(name) END (name) 72# endif 73 74ENTRY2 (STRCASECMP) 75 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax 76 mov %fs:(%rax),%RDX_LP 77 78 /* Either 1 or 5 bytes (depending on whether CET is enabled). */ 79 .p2align 4 80END2 (STRCASECMP) 81 /* FALLTHROUGH to strcasecmp_l. */ 82# elif defined USE_AS_STRNCASECMP_L 83# ifndef ENTRY2 84# define ENTRY2(name) ENTRY (name) 85# define END2(name) END (name) 86# endif 87 88ENTRY2 (STRCASECMP) 89 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax 90 mov %fs:(%rax),%RCX_LP 91 92 /* Either 1 or 5 bytes (depending on whether CET is enabled). */ 93 .p2align 4 94END2 (STRCASECMP) 95 /* FALLTHROUGH to strncasecmp_l. */ 96# endif 97 98ENTRY (STRCMP) 99# ifdef USE_AS_STRCASECMP_L 100 /* We have to fall back on the C implementation for locales 101 with encodings not matching ASCII for single bytes. 
*/ 102# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 103 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP 104# else 105 mov (%rdx), %RAX_LP 106# endif 107 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) 108 jne __strcasecmp_l_nonascii 109# elif defined USE_AS_STRNCASECMP_L 110 /* We have to fall back on the C implementation for locales 111 with encodings not matching ASCII for single bytes. */ 112# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 113 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP 114# else 115 mov (%rcx), %RAX_LP 116# endif 117 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) 118 jne __strncasecmp_l_nonascii 119# endif 120 121/* 122 * This implementation uses SSE to compare up to 16 bytes at a time. 123 */ 124# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 125 test %RDX_LP, %RDX_LP 126 je LABEL(strcmp_exitz) 127 cmp $1, %RDX_LP 128 je LABEL(Byte0) 129 mov %RDX_LP, %R11_LP 130# endif 131 mov %esi, %ecx 132 mov %edi, %eax 133/* Use 64bit AND here to avoid long NOP padding. 
*/ 134 and $0x3f, %rcx /* rsi alignment in cache line */ 135 and $0x3f, %rax /* rdi alignment in cache line */ 136# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L 137 .section .rodata.cst16,"aM",@progbits,16 138 .align 16 139.Llcase_min: 140 .quad 0x3f3f3f3f3f3f3f3f 141 .quad 0x3f3f3f3f3f3f3f3f 142.Llcase_max: 143 .quad 0x9999999999999999 144 .quad 0x9999999999999999 145.Lcase_add: 146 .quad 0x2020202020202020 147 .quad 0x2020202020202020 148 .previous 149 movdqa .Llcase_min(%rip), %xmm5 150# define LCASE_MIN_reg %xmm5 151 movdqa .Llcase_max(%rip), %xmm6 152# define LCASE_MAX_reg %xmm6 153 movdqa .Lcase_add(%rip), %xmm7 154# define CASE_ADD_reg %xmm7 155# endif 156 cmp $0x30, %ecx 157 ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ 158 cmp $0x30, %eax 159 ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ 160 movlpd (%rdi), %xmm1 161 movlpd (%rsi), %xmm2 162 movhpd 8(%rdi), %xmm1 163 movhpd 8(%rsi), %xmm2 164# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L 165# define TOLOWER(reg1, reg2) \ 166 movdqa LCASE_MIN_reg, %xmm8; \ 167 movdqa LCASE_MIN_reg, %xmm9; \ 168 paddb reg1, %xmm8; \ 169 paddb reg2, %xmm9; \ 170 pcmpgtb LCASE_MAX_reg, %xmm8; \ 171 pcmpgtb LCASE_MAX_reg, %xmm9; \ 172 pandn CASE_ADD_reg, %xmm8; \ 173 pandn CASE_ADD_reg, %xmm9; \ 174 paddb %xmm8, reg1; \ 175 paddb %xmm9, reg2 176 TOLOWER (%xmm1, %xmm2) 177# else 178# define TOLOWER(reg1, reg2) 179# endif 180 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ 181 pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 182 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ 183 psubb %xmm0, %xmm1 /* packed sub of comparison results*/ 184 pmovmskb %xmm1, %edx 185 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ 186 jnz LABEL(less16bytes) /* If not, find different value or null char */ 187# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 188 sub $16, %r11 189 jbe LABEL(strcmp_exitz) /* finish comparison */ 190# endif 191 add $16, %rsi /* prepare to search next 16 bytes */ 192 add $16, %rdi /* prepare to search next 16 bytes */ 193 194 /* 195 * Determine source and destination string offsets from 16-byte alignment. 196 * Use relative offset difference between the two to determine which case 197 * below to use. 198 */ 199 .p2align 4 200LABEL(crosscache): 201 and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */ 202 and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */ 203 mov $0xffff, %edx /* for equivalent offset */ 204 xor %r8d, %r8d 205 and $0xf, %ecx /* offset of rsi */ 206 and $0xf, %eax /* offset of rdi */ 207 cmp %eax, %ecx 208 je LABEL(ashr_0) /* rsi and rdi relative offset same */ 209 ja LABEL(bigger) 210 mov %edx, %r8d /* r8d is offset flag for exit tail */ 211 xchg %ecx, %eax 212 xchg %rsi, %rdi 213LABEL(bigger): 214 lea 15(%rax), %r9 215 sub %rcx, %r9 216 lea LABEL(unaligned_table)(%rip), %r10 217 movslq (%r10, %r9,4), %r9 218 lea (%r10, %r9), %r10 219 _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ 220 221/* 222 * The following cases will be handled by ashr_0 223 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 224 * n(0~15) n(0~15) 15(15+ n-n) ashr_0 225 */ 226 .p2align 4 227LABEL(ashr_0): 228 229 movdqa (%rsi), %xmm1 230 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ 231 pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 232# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 233 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ 234# else 235 movdqa (%rdi), %xmm2 236 TOLOWER (%xmm1, %xmm2) 237 pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ 238# endif 239 psubb %xmm0, %xmm1 /* packed sub of comparison results*/ 240 pmovmskb %xmm1, %r9d 241 shr %cl, %edx /* adjust 0xffff for offset */ 242 shr %cl, %r9d /* adjust for 16-byte offset */ 243 sub %r9d, %edx 244 /* 245 * edx must be the same with r9d if in left byte (16-rcx) is equal to 246 * the start from (16-rax) and no null char was seen. 247 */ 248 jne LABEL(less32bytes) /* mismatch or null char */ 249 UPDATE_STRNCMP_COUNTER 250 mov $16, %rcx 251 mov $16, %r9 252 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ 253 254 /* 255 * Now both strings are aligned at 16-byte boundary. Loop over strings 256 * checking 32-bytes per iteration. 257 */ 258 .p2align 4 259LABEL(loop_ashr_0): 260 movdqa (%rsi, %rcx), %xmm1 261 movdqa (%rdi, %rcx), %xmm2 262 TOLOWER (%xmm1, %xmm2) 263 264 pcmpeqb %xmm1, %xmm0 265 pcmpeqb %xmm2, %xmm1 266 psubb %xmm0, %xmm1 267 pmovmskb %xmm1, %edx 268 sub $0xffff, %edx 269 jnz LABEL(exit) /* mismatch or null char seen */ 270 271# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 272 sub $16, %r11 273 jbe LABEL(strcmp_exitz) 274# endif 275 add $16, %rcx 276 movdqa (%rsi, %rcx), %xmm1 277 movdqa (%rdi, %rcx), %xmm2 278 TOLOWER (%xmm1, %xmm2) 279 280 pcmpeqb %xmm1, %xmm0 281 pcmpeqb %xmm2, %xmm1 282 psubb %xmm0, %xmm1 283 pmovmskb %xmm1, %edx 284 sub $0xffff, %edx 285 jnz LABEL(exit) 286# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 287 sub $16, %r11 288 jbe LABEL(strcmp_exitz) 289# endif 290 add $16, %rcx 291 jmp LABEL(loop_ashr_0) 292 293/* 294 * The following cases will be handled by ashr_1 295 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 296 * n(15) n -15 0(15 +(n-15) - n) ashr_1 297 */ 298 .p2align 4 299LABEL(ashr_1): 300 
pxor %xmm0, %xmm0 301 movdqa (%rdi), %xmm2 302 movdqa (%rsi), %xmm1 303 pcmpeqb %xmm1, %xmm0 /* Any null chars? */ 304 pslldq $15, %xmm2 /* shift first string to align with second */ 305 TOLOWER (%xmm1, %xmm2) 306 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ 307 psubb %xmm0, %xmm2 /* packed sub of comparison results*/ 308 pmovmskb %xmm2, %r9d 309 shr %cl, %edx /* adjust 0xffff for offset */ 310 shr %cl, %r9d /* adjust for 16-byte offset */ 311 sub %r9d, %edx 312 jnz LABEL(less32bytes) /* mismatch or null char seen */ 313 movdqa (%rdi), %xmm3 314 UPDATE_STRNCMP_COUNTER 315 316 pxor %xmm0, %xmm0 317 mov $16, %rcx /* index for loads*/ 318 mov $1, %r9d /* byte position left over from less32bytes case */ 319 /* 320 * Setup %r10 value allows us to detect crossing a page boundary. 321 * When %r10 goes positive we have crossed a page boundary and 322 * need to do a nibble. 323 */ 324 lea 1(%rdi), %r10 325 and $0xfff, %r10 /* offset into 4K page */ 326 sub $0x1000, %r10 /* subtract 4K pagesize */ 327 328 .p2align 4 329LABEL(loop_ashr_1): 330 add $16, %r10 331 jg LABEL(nibble_ashr_1) /* cross page boundary */ 332 333LABEL(gobble_ashr_1): 334 movdqa (%rsi, %rcx), %xmm1 335 movdqa (%rdi, %rcx), %xmm2 336 movdqa %xmm2, %xmm4 /* store for next cycle */ 337 338 psrldq $1, %xmm3 339 pslldq $15, %xmm2 340 por %xmm3, %xmm2 /* merge into one 16byte value */ 341 342 TOLOWER (%xmm1, %xmm2) 343 344 pcmpeqb %xmm1, %xmm0 345 pcmpeqb %xmm2, %xmm1 346 psubb %xmm0, %xmm1 347 pmovmskb %xmm1, %edx 348 sub $0xffff, %edx 349 jnz LABEL(exit) 350 351# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 352 sub $16, %r11 353 jbe LABEL(strcmp_exitz) 354# endif 355 add $16, %rcx 356 movdqa %xmm4, %xmm3 357 358 add $16, %r10 359 jg LABEL(nibble_ashr_1) /* cross page boundary */ 360 361 movdqa (%rsi, %rcx), %xmm1 362 movdqa (%rdi, %rcx), %xmm2 363 movdqa %xmm2, %xmm4 /* store for next cycle */ 364 365 psrldq $1, %xmm3 366 pslldq $15, %xmm2 367 por %xmm3, %xmm2 /* merge into one 16byte 
value */ 368 369 TOLOWER (%xmm1, %xmm2) 370 371 pcmpeqb %xmm1, %xmm0 372 pcmpeqb %xmm2, %xmm1 373 psubb %xmm0, %xmm1 374 pmovmskb %xmm1, %edx 375 sub $0xffff, %edx 376 jnz LABEL(exit) 377 378# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 379 sub $16, %r11 380 jbe LABEL(strcmp_exitz) 381# endif 382 add $16, %rcx 383 movdqa %xmm4, %xmm3 384 jmp LABEL(loop_ashr_1) 385 386 /* 387 * Nibble avoids loads across page boundary. This is to avoid a potential 388 * access into unmapped memory. 389 */ 390 .p2align 4 391LABEL(nibble_ashr_1): 392 pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ 393 pmovmskb %xmm0, %edx 394 test $0xfffe, %edx 395 jnz LABEL(ashr_1_exittail) /* find null char*/ 396 397# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 398 cmp $15, %r11 399 jbe LABEL(ashr_1_exittail) 400# endif 401 402 pxor %xmm0, %xmm0 403 sub $0x1000, %r10 /* subtract 4K from %r10 */ 404 jmp LABEL(gobble_ashr_1) 405 406 /* 407 * Once a null char is found, determine if there is a string mismatch 408 * before the null char. 409 */ 410 .p2align 4 411LABEL(ashr_1_exittail): 412 movdqa (%rsi, %rcx), %xmm1 413 psrldq $1, %xmm0 414 psrldq $1, %xmm3 415 jmp LABEL(aftertail) 416 417/* 418 * The following cases will be handled by ashr_2 419 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 420 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 421 */ 422 .p2align 4 423LABEL(ashr_2): 424 pxor %xmm0, %xmm0 425 movdqa (%rdi), %xmm2 426 movdqa (%rsi), %xmm1 427 pcmpeqb %xmm1, %xmm0 428 pslldq $14, %xmm2 429 TOLOWER (%xmm1, %xmm2) 430 pcmpeqb %xmm1, %xmm2 431 psubb %xmm0, %xmm2 432 pmovmskb %xmm2, %r9d 433 shr %cl, %edx 434 shr %cl, %r9d 435 sub %r9d, %edx 436 jnz LABEL(less32bytes) 437 movdqa (%rdi), %xmm3 438 UPDATE_STRNCMP_COUNTER 439 440 pxor %xmm0, %xmm0 441 mov $16, %rcx /* index for loads */ 442 mov $2, %r9d /* byte position left over from less32bytes case */ 443 /* 444 * Setup %r10 value allows us to detect crossing a page boundary. 
445 * When %r10 goes positive we have crossed a page boundary and 446 * need to do a nibble. 447 */ 448 lea 2(%rdi), %r10 449 and $0xfff, %r10 /* offset into 4K page */ 450 sub $0x1000, %r10 /* subtract 4K pagesize */ 451 452 .p2align 4 453LABEL(loop_ashr_2): 454 add $16, %r10 455 jg LABEL(nibble_ashr_2) 456 457LABEL(gobble_ashr_2): 458 movdqa (%rsi, %rcx), %xmm1 459 movdqa (%rdi, %rcx), %xmm2 460 movdqa %xmm2, %xmm4 461 462 psrldq $2, %xmm3 463 pslldq $14, %xmm2 464 por %xmm3, %xmm2 /* merge into one 16byte value */ 465 466 TOLOWER (%xmm1, %xmm2) 467 468 pcmpeqb %xmm1, %xmm0 469 pcmpeqb %xmm2, %xmm1 470 psubb %xmm0, %xmm1 471 pmovmskb %xmm1, %edx 472 sub $0xffff, %edx 473 jnz LABEL(exit) 474 475# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 476 sub $16, %r11 477 jbe LABEL(strcmp_exitz) 478# endif 479 480 add $16, %rcx 481 movdqa %xmm4, %xmm3 482 483 add $16, %r10 484 jg LABEL(nibble_ashr_2) /* cross page boundary */ 485 486 movdqa (%rsi, %rcx), %xmm1 487 movdqa (%rdi, %rcx), %xmm2 488 movdqa %xmm2, %xmm4 489 490 psrldq $2, %xmm3 491 pslldq $14, %xmm2 492 por %xmm3, %xmm2 /* merge into one 16byte value */ 493 494 TOLOWER (%xmm1, %xmm2) 495 496 pcmpeqb %xmm1, %xmm0 497 pcmpeqb %xmm2, %xmm1 498 psubb %xmm0, %xmm1 499 pmovmskb %xmm1, %edx 500 sub $0xffff, %edx 501 jnz LABEL(exit) 502 503# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 504 sub $16, %r11 505 jbe LABEL(strcmp_exitz) 506# endif 507 508 add $16, %rcx 509 movdqa %xmm4, %xmm3 510 jmp LABEL(loop_ashr_2) 511 512 .p2align 4 513LABEL(nibble_ashr_2): 514 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 515 pmovmskb %xmm0, %edx 516 test $0xfffc, %edx 517 jnz LABEL(ashr_2_exittail) 518 519# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 520 cmp $14, %r11 521 jbe LABEL(ashr_2_exittail) 522# endif 523 524 pxor %xmm0, %xmm0 525 sub $0x1000, %r10 526 jmp LABEL(gobble_ashr_2) 527 528 .p2align 4 529LABEL(ashr_2_exittail): 530 movdqa (%rsi, %rcx), %xmm1 531 psrldq $2, %xmm0 532 
psrldq $2, %xmm3 533 jmp LABEL(aftertail) 534 535/* 536 * The following cases will be handled by ashr_3 537 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 538 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 539 */ 540 .p2align 4 541LABEL(ashr_3): 542 pxor %xmm0, %xmm0 543 movdqa (%rdi), %xmm2 544 movdqa (%rsi), %xmm1 545 pcmpeqb %xmm1, %xmm0 546 pslldq $13, %xmm2 547 TOLOWER (%xmm1, %xmm2) 548 pcmpeqb %xmm1, %xmm2 549 psubb %xmm0, %xmm2 550 pmovmskb %xmm2, %r9d 551 shr %cl, %edx 552 shr %cl, %r9d 553 sub %r9d, %edx 554 jnz LABEL(less32bytes) 555 movdqa (%rdi), %xmm3 556 557 UPDATE_STRNCMP_COUNTER 558 559 pxor %xmm0, %xmm0 560 mov $16, %rcx /* index for loads */ 561 mov $3, %r9d /* byte position left over from less32bytes case */ 562 /* 563 * Setup %r10 value allows us to detect crossing a page boundary. 564 * When %r10 goes positive we have crossed a page boundary and 565 * need to do a nibble. 566 */ 567 lea 3(%rdi), %r10 568 and $0xfff, %r10 /* offset into 4K page */ 569 sub $0x1000, %r10 /* subtract 4K pagesize */ 570 571 .p2align 4 572LABEL(loop_ashr_3): 573 add $16, %r10 574 jg LABEL(nibble_ashr_3) 575 576LABEL(gobble_ashr_3): 577 movdqa (%rsi, %rcx), %xmm1 578 movdqa (%rdi, %rcx), %xmm2 579 movdqa %xmm2, %xmm4 580 581 psrldq $3, %xmm3 582 pslldq $13, %xmm2 583 por %xmm3, %xmm2 /* merge into one 16byte value */ 584 585 TOLOWER (%xmm1, %xmm2) 586 587 pcmpeqb %xmm1, %xmm0 588 pcmpeqb %xmm2, %xmm1 589 psubb %xmm0, %xmm1 590 pmovmskb %xmm1, %edx 591 sub $0xffff, %edx 592 jnz LABEL(exit) 593 594# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 595 sub $16, %r11 596 jbe LABEL(strcmp_exitz) 597# endif 598 599 add $16, %rcx 600 movdqa %xmm4, %xmm3 601 602 add $16, %r10 603 jg LABEL(nibble_ashr_3) /* cross page boundary */ 604 605 movdqa (%rsi, %rcx), %xmm1 606 movdqa (%rdi, %rcx), %xmm2 607 movdqa %xmm2, %xmm4 608 609 psrldq $3, %xmm3 610 pslldq $13, %xmm2 611 por %xmm3, %xmm2 /* merge into one 16byte value */ 612 613 TOLOWER (%xmm1, 
%xmm2) 614 615 pcmpeqb %xmm1, %xmm0 616 pcmpeqb %xmm2, %xmm1 617 psubb %xmm0, %xmm1 618 pmovmskb %xmm1, %edx 619 sub $0xffff, %edx 620 jnz LABEL(exit) 621 622# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 623 sub $16, %r11 624 jbe LABEL(strcmp_exitz) 625# endif 626 627 add $16, %rcx 628 movdqa %xmm4, %xmm3 629 jmp LABEL(loop_ashr_3) 630 631 .p2align 4 632LABEL(nibble_ashr_3): 633 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 634 pmovmskb %xmm0, %edx 635 test $0xfff8, %edx 636 jnz LABEL(ashr_3_exittail) 637 638# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 639 cmp $13, %r11 640 jbe LABEL(ashr_3_exittail) 641# endif 642 643 pxor %xmm0, %xmm0 644 sub $0x1000, %r10 645 jmp LABEL(gobble_ashr_3) 646 647 .p2align 4 648LABEL(ashr_3_exittail): 649 movdqa (%rsi, %rcx), %xmm1 650 psrldq $3, %xmm0 651 psrldq $3, %xmm3 652 jmp LABEL(aftertail) 653 654/* 655 * The following cases will be handled by ashr_4 656 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 657 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 658 */ 659 .p2align 4 660LABEL(ashr_4): 661 pxor %xmm0, %xmm0 662 movdqa (%rdi), %xmm2 663 movdqa (%rsi), %xmm1 664 pcmpeqb %xmm1, %xmm0 665 pslldq $12, %xmm2 666 TOLOWER (%xmm1, %xmm2) 667 pcmpeqb %xmm1, %xmm2 668 psubb %xmm0, %xmm2 669 pmovmskb %xmm2, %r9d 670 shr %cl, %edx 671 shr %cl, %r9d 672 sub %r9d, %edx 673 jnz LABEL(less32bytes) 674 movdqa (%rdi), %xmm3 675 676 UPDATE_STRNCMP_COUNTER 677 678 pxor %xmm0, %xmm0 679 mov $16, %rcx /* index for loads */ 680 mov $4, %r9d /* byte position left over from less32bytes case */ 681 /* 682 * Setup %r10 value allows us to detect crossing a page boundary. 683 * When %r10 goes positive we have crossed a page boundary and 684 * need to do a nibble. 
685 */ 686 lea 4(%rdi), %r10 687 and $0xfff, %r10 /* offset into 4K page */ 688 sub $0x1000, %r10 /* subtract 4K pagesize */ 689 690 .p2align 4 691LABEL(loop_ashr_4): 692 add $16, %r10 693 jg LABEL(nibble_ashr_4) 694 695LABEL(gobble_ashr_4): 696 movdqa (%rsi, %rcx), %xmm1 697 movdqa (%rdi, %rcx), %xmm2 698 movdqa %xmm2, %xmm4 699 700 psrldq $4, %xmm3 701 pslldq $12, %xmm2 702 por %xmm3, %xmm2 /* merge into one 16byte value */ 703 704 TOLOWER (%xmm1, %xmm2) 705 706 pcmpeqb %xmm1, %xmm0 707 pcmpeqb %xmm2, %xmm1 708 psubb %xmm0, %xmm1 709 pmovmskb %xmm1, %edx 710 sub $0xffff, %edx 711 jnz LABEL(exit) 712 713# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 714 sub $16, %r11 715 jbe LABEL(strcmp_exitz) 716# endif 717 718 add $16, %rcx 719 movdqa %xmm4, %xmm3 720 721 add $16, %r10 722 jg LABEL(nibble_ashr_4) /* cross page boundary */ 723 724 movdqa (%rsi, %rcx), %xmm1 725 movdqa (%rdi, %rcx), %xmm2 726 movdqa %xmm2, %xmm4 727 728 psrldq $4, %xmm3 729 pslldq $12, %xmm2 730 por %xmm3, %xmm2 /* merge into one 16byte value */ 731 732 TOLOWER (%xmm1, %xmm2) 733 734 pcmpeqb %xmm1, %xmm0 735 pcmpeqb %xmm2, %xmm1 736 psubb %xmm0, %xmm1 737 pmovmskb %xmm1, %edx 738 sub $0xffff, %edx 739 jnz LABEL(exit) 740 741# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 742 sub $16, %r11 743 jbe LABEL(strcmp_exitz) 744# endif 745 746 add $16, %rcx 747 movdqa %xmm4, %xmm3 748 jmp LABEL(loop_ashr_4) 749 750 .p2align 4 751LABEL(nibble_ashr_4): 752 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 753 pmovmskb %xmm0, %edx 754 test $0xfff0, %edx 755 jnz LABEL(ashr_4_exittail) 756 757# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 758 cmp $12, %r11 759 jbe LABEL(ashr_4_exittail) 760# endif 761 762 pxor %xmm0, %xmm0 763 sub $0x1000, %r10 764 jmp LABEL(gobble_ashr_4) 765 766 .p2align 4 767LABEL(ashr_4_exittail): 768 movdqa (%rsi, %rcx), %xmm1 769 psrldq $4, %xmm0 770 psrldq $4, %xmm3 771 jmp LABEL(aftertail) 772 773/* 774 * The following cases will be handled by 
ashr_5 775 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 776 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 777 */ 778 .p2align 4 779LABEL(ashr_5): 780 pxor %xmm0, %xmm0 781 movdqa (%rdi), %xmm2 782 movdqa (%rsi), %xmm1 783 pcmpeqb %xmm1, %xmm0 784 pslldq $11, %xmm2 785 TOLOWER (%xmm1, %xmm2) 786 pcmpeqb %xmm1, %xmm2 787 psubb %xmm0, %xmm2 788 pmovmskb %xmm2, %r9d 789 shr %cl, %edx 790 shr %cl, %r9d 791 sub %r9d, %edx 792 jnz LABEL(less32bytes) 793 movdqa (%rdi), %xmm3 794 795 UPDATE_STRNCMP_COUNTER 796 797 pxor %xmm0, %xmm0 798 mov $16, %rcx /* index for loads */ 799 mov $5, %r9d /* byte position left over from less32bytes case */ 800 /* 801 * Setup %r10 value allows us to detect crossing a page boundary. 802 * When %r10 goes positive we have crossed a page boundary and 803 * need to do a nibble. 804 */ 805 lea 5(%rdi), %r10 806 and $0xfff, %r10 /* offset into 4K page */ 807 sub $0x1000, %r10 /* subtract 4K pagesize */ 808 809 .p2align 4 810LABEL(loop_ashr_5): 811 add $16, %r10 812 jg LABEL(nibble_ashr_5) 813 814LABEL(gobble_ashr_5): 815 movdqa (%rsi, %rcx), %xmm1 816 movdqa (%rdi, %rcx), %xmm2 817 movdqa %xmm2, %xmm4 818 819 psrldq $5, %xmm3 820 pslldq $11, %xmm2 821 por %xmm3, %xmm2 /* merge into one 16byte value */ 822 823 TOLOWER (%xmm1, %xmm2) 824 825 pcmpeqb %xmm1, %xmm0 826 pcmpeqb %xmm2, %xmm1 827 psubb %xmm0, %xmm1 828 pmovmskb %xmm1, %edx 829 sub $0xffff, %edx 830 jnz LABEL(exit) 831 832# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 833 sub $16, %r11 834 jbe LABEL(strcmp_exitz) 835# endif 836 837 add $16, %rcx 838 movdqa %xmm4, %xmm3 839 840 add $16, %r10 841 jg LABEL(nibble_ashr_5) /* cross page boundary */ 842 843 movdqa (%rsi, %rcx), %xmm1 844 movdqa (%rdi, %rcx), %xmm2 845 movdqa %xmm2, %xmm4 846 847 psrldq $5, %xmm3 848 pslldq $11, %xmm2 849 por %xmm3, %xmm2 /* merge into one 16byte value */ 850 851 TOLOWER (%xmm1, %xmm2) 852 853 pcmpeqb %xmm1, %xmm0 854 pcmpeqb %xmm2, %xmm1 855 psubb %xmm0, %xmm1 856 pmovmskb 
%xmm1, %edx 857 sub $0xffff, %edx 858 jnz LABEL(exit) 859 860# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 861 sub $16, %r11 862 jbe LABEL(strcmp_exitz) 863# endif 864 865 add $16, %rcx 866 movdqa %xmm4, %xmm3 867 jmp LABEL(loop_ashr_5) 868 869 .p2align 4 870LABEL(nibble_ashr_5): 871 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 872 pmovmskb %xmm0, %edx 873 test $0xffe0, %edx 874 jnz LABEL(ashr_5_exittail) 875 876# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 877 cmp $11, %r11 878 jbe LABEL(ashr_5_exittail) 879# endif 880 881 pxor %xmm0, %xmm0 882 sub $0x1000, %r10 883 jmp LABEL(gobble_ashr_5) 884 885 .p2align 4 886LABEL(ashr_5_exittail): 887 movdqa (%rsi, %rcx), %xmm1 888 psrldq $5, %xmm0 889 psrldq $5, %xmm3 890 jmp LABEL(aftertail) 891 892/* 893 * The following cases will be handled by ashr_6 894 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 895 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 896 */ 897 .p2align 4 898LABEL(ashr_6): 899 pxor %xmm0, %xmm0 900 movdqa (%rdi), %xmm2 901 movdqa (%rsi), %xmm1 902 pcmpeqb %xmm1, %xmm0 903 pslldq $10, %xmm2 904 TOLOWER (%xmm1, %xmm2) 905 pcmpeqb %xmm1, %xmm2 906 psubb %xmm0, %xmm2 907 pmovmskb %xmm2, %r9d 908 shr %cl, %edx 909 shr %cl, %r9d 910 sub %r9d, %edx 911 jnz LABEL(less32bytes) 912 movdqa (%rdi), %xmm3 913 914 UPDATE_STRNCMP_COUNTER 915 916 pxor %xmm0, %xmm0 917 mov $16, %rcx /* index for loads */ 918 mov $6, %r9d /* byte position left over from less32bytes case */ 919 /* 920 * Setup %r10 value allows us to detect crossing a page boundary. 921 * When %r10 goes positive we have crossed a page boundary and 922 * need to do a nibble. 
923 */ 924 lea 6(%rdi), %r10 925 and $0xfff, %r10 /* offset into 4K page */ 926 sub $0x1000, %r10 /* subtract 4K pagesize */ 927 928 .p2align 4 929LABEL(loop_ashr_6): 930 add $16, %r10 931 jg LABEL(nibble_ashr_6) 932 933LABEL(gobble_ashr_6): 934 movdqa (%rsi, %rcx), %xmm1 935 movdqa (%rdi, %rcx), %xmm2 936 movdqa %xmm2, %xmm4 937 938 psrldq $6, %xmm3 939 pslldq $10, %xmm2 940 por %xmm3, %xmm2 /* merge into one 16byte value */ 941 942 TOLOWER (%xmm1, %xmm2) 943 944 pcmpeqb %xmm1, %xmm0 945 pcmpeqb %xmm2, %xmm1 946 psubb %xmm0, %xmm1 947 pmovmskb %xmm1, %edx 948 sub $0xffff, %edx 949 jnz LABEL(exit) 950 951# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 952 sub $16, %r11 953 jbe LABEL(strcmp_exitz) 954# endif 955 956 add $16, %rcx 957 movdqa %xmm4, %xmm3 958 959 add $16, %r10 960 jg LABEL(nibble_ashr_6) /* cross page boundary */ 961 962 movdqa (%rsi, %rcx), %xmm1 963 movdqa (%rdi, %rcx), %xmm2 964 movdqa %xmm2, %xmm4 965 966 psrldq $6, %xmm3 967 pslldq $10, %xmm2 968 por %xmm3, %xmm2 /* merge into one 16byte value */ 969 970 TOLOWER (%xmm1, %xmm2) 971 972 pcmpeqb %xmm1, %xmm0 973 pcmpeqb %xmm2, %xmm1 974 psubb %xmm0, %xmm1 975 pmovmskb %xmm1, %edx 976 sub $0xffff, %edx 977 jnz LABEL(exit) 978 979# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 980 sub $16, %r11 981 jbe LABEL(strcmp_exitz) 982# endif 983 984 add $16, %rcx 985 movdqa %xmm4, %xmm3 986 jmp LABEL(loop_ashr_6) 987 988 .p2align 4 989LABEL(nibble_ashr_6): 990 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 991 pmovmskb %xmm0, %edx 992 test $0xffc0, %edx 993 jnz LABEL(ashr_6_exittail) 994 995# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 996 cmp $10, %r11 997 jbe LABEL(ashr_6_exittail) 998# endif 999 1000 pxor %xmm0, %xmm0 1001 sub $0x1000, %r10 1002 jmp LABEL(gobble_ashr_6) 1003 1004 .p2align 4 1005LABEL(ashr_6_exittail): 1006 movdqa (%rsi, %rcx), %xmm1 1007 psrldq $6, %xmm0 1008 psrldq $6, %xmm3 1009 jmp LABEL(aftertail) 1010 1011/* 1012 * The following cases will 
be handled by ashr_7 1013 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 1014 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 1015 */ 1016 .p2align 4 1017LABEL(ashr_7): 1018 pxor %xmm0, %xmm0 1019 movdqa (%rdi), %xmm2 1020 movdqa (%rsi), %xmm1 1021 pcmpeqb %xmm1, %xmm0 1022 pslldq $9, %xmm2 1023 TOLOWER (%xmm1, %xmm2) 1024 pcmpeqb %xmm1, %xmm2 1025 psubb %xmm0, %xmm2 1026 pmovmskb %xmm2, %r9d 1027 shr %cl, %edx 1028 shr %cl, %r9d 1029 sub %r9d, %edx 1030 jnz LABEL(less32bytes) 1031 movdqa (%rdi), %xmm3 1032 1033 UPDATE_STRNCMP_COUNTER 1034 1035 pxor %xmm0, %xmm0 1036 mov $16, %rcx /* index for loads */ 1037 mov $7, %r9d /* byte position left over from less32bytes case */ 1038 /* 1039 * Setup %r10 value allows us to detect crossing a page boundary. 1040 * When %r10 goes positive we have crossed a page boundary and 1041 * need to do a nibble. 1042 */ 1043 lea 7(%rdi), %r10 1044 and $0xfff, %r10 /* offset into 4K page */ 1045 sub $0x1000, %r10 /* subtract 4K pagesize */ 1046 1047 .p2align 4 1048LABEL(loop_ashr_7): 1049 add $16, %r10 1050 jg LABEL(nibble_ashr_7) 1051 1052LABEL(gobble_ashr_7): 1053 movdqa (%rsi, %rcx), %xmm1 1054 movdqa (%rdi, %rcx), %xmm2 1055 movdqa %xmm2, %xmm4 1056 1057 psrldq $7, %xmm3 1058 pslldq $9, %xmm2 1059 por %xmm3, %xmm2 /* merge into one 16byte value */ 1060 1061 TOLOWER (%xmm1, %xmm2) 1062 1063 pcmpeqb %xmm1, %xmm0 1064 pcmpeqb %xmm2, %xmm1 1065 psubb %xmm0, %xmm1 1066 pmovmskb %xmm1, %edx 1067 sub $0xffff, %edx 1068 jnz LABEL(exit) 1069 1070# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 1071 sub $16, %r11 1072 jbe LABEL(strcmp_exitz) 1073# endif 1074 1075 add $16, %rcx 1076 movdqa %xmm4, %xmm3 1077 1078 add $16, %r10 1079 jg LABEL(nibble_ashr_7) /* cross page boundary */ 1080 1081 movdqa (%rsi, %rcx), %xmm1 1082 movdqa (%rdi, %rcx), %xmm2 1083 movdqa %xmm2, %xmm4 1084 1085 psrldq $7, %xmm3 1086 pslldq $9, %xmm2 1087 por %xmm3, %xmm2 /* merge into one 16byte value */ 1088 1089 TOLOWER (%xmm1, %xmm2) 1090 
/* Tail of an unrolled loop_ashr_7 round; the round starts above this point.  */
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_7)

	.p2align 4
LABEL(nibble_ashr_7):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff80, %edx
	jnz	LABEL(ashr_7_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$9, %r11
	jbe	LABEL(ashr_7_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_7)

	.p2align 4
LABEL(ashr_7_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$7, %xmm0
	psrldq	$7, %xmm3
	jmp	LABEL(aftertail)

/*
 * The following cases will be handled by ashr_8
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset       corresponding case
 *        n(8~15)            n - 8           7(15 +(n - 8) - n)        ashr_8
 *
 * All ashr_N blocks below share one scheme.  %xmm3 carries the previous
 * aligned 16 bytes of %rdi; each round loads the next aligned 16 bytes,
 * builds (previous >> N bytes) | (next << (16 - N) bytes) and compares
 * the result with the aligned 16 bytes at (%rsi, %rcx).  %xmm0 is zero
 * at the start of every round and is used to spot null bytes.  %r9d
 * holds N so that LABEL(exit) can rebuild the %rdi offset, %r10 counts
 * up towards the next 4K page, and, in the strncmp variants, %r11 is
 * the number of bytes still to be compared.  %edx, %cl and %rax are
 * expected to be set up by the dispatch code before the jump here (that
 * code is not part of this section of the file).
 */
	.p2align 4
LABEL(ashr_8):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where (%rsi) holds a null byte */
	pslldq	$8, %xmm2		/* move (%rdi) bytes up by 8 positions */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* clear the high bit of null positions */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* becomes the "previous" chunk */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$8, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
LABEL(loop_ashr_8):
	add	$16, %r10
	jg	LABEL(nibble_ashr_8)

LABEL(gobble_ashr_8):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw chunk for the next round */

	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0		/* 0xff where %xmm1 holds a null byte */
	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm1		/* high bit survives only for equal, non-null bytes */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes are equal and non-null */
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count used up without a difference */
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second, unrolled copy of the round above.  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_8)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_8)

	/*
	 * Reached when %r10 went positive.  Look for a null byte in the
	 * upper 8 bytes of %xmm3, i.e. the bytes of the previous chunk
	 * that have not been compared yet.  If there is one (or, for the
	 * strncmp variants, if no more than those 8 bytes remain) finish
	 * through the exittail path; otherwise re-zero %xmm0, rewind %r10
	 * by one page and resume the round.
	 */
	.p2align 4
LABEL(nibble_ashr_8):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff00, %edx
	jnz	LABEL(ashr_8_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$8, %r11
	jbe	LABEL(ashr_8_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_8)

	/* Compare only the leftover bytes of %xmm3 with (%rsi, %rcx).  */
	.p2align 4
LABEL(ashr_8_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$8, %xmm0		/* null mask of %xmm3, moved down */
	psrldq	$8, %xmm3		/* leftover bytes, moved down */
	jmp	LABEL(aftertail)

/*
 * The following cases will be handled by ashr_9
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset       corresponding case
 *        n(7~15)            n - 7           8(15 +(n - 7) - n)        ashr_9
 *
 * Same scheme as ashr_8 with N = 9: previous chunk shifted right by 9
 * bytes, next chunk shifted left by 7 bytes.
 */
	.p2align 4
LABEL(ashr_9):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$7, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$9, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
LABEL(loop_ashr_9):
	add	$16, %r10
	jg	LABEL(nibble_ashr_9)

LABEL(gobble_ashr_9):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_9)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* store for next cycle */
	jmp	LABEL(loop_ashr_9)

	/* Page-crossing check: 7 bytes of %xmm3 are still uncompared.  */
	.p2align 4
LABEL(nibble_ashr_9):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfe00, %edx
	jnz	LABEL(ashr_9_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$7, %r11
	jbe	LABEL(ashr_9_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_9)

	.p2align 4
LABEL(ashr_9_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$9, %xmm0
	psrldq	$9, %xmm3
	jmp	LABEL(aftertail)

/*
 * The following cases will be handled by ashr_10
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset       corresponding case
 *        n(6~15)            n - 6           9(15 +(n - 6) - n)        ashr_10
 *
 * Same scheme as ashr_8 with N = 10: previous chunk shifted right by 10
 * bytes, next chunk shifted left by 6 bytes.
 */
	.p2align 4
LABEL(ashr_10):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$6, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$10, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
LABEL(loop_ashr_10):
	add	$16, %r10
	jg	LABEL(nibble_ashr_10)

LABEL(gobble_ashr_10):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$10, %xmm3
	pslldq	$6, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_10)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$10, %xmm3
	pslldq	$6, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_10)

	/* Page-crossing check: 6 bytes of %xmm3 are still uncompared.  */
	.p2align 4
LABEL(nibble_ashr_10):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfc00, %edx
	jnz	LABEL(ashr_10_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$6, %r11
	jbe	LABEL(ashr_10_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_10)

	.p2align 4
LABEL(ashr_10_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$10, %xmm0
	psrldq	$10, %xmm3
	jmp	LABEL(aftertail)

/*
 * The following cases will be handled by ashr_11
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset       corresponding case
 *        n(5~15)            n - 5           10(15 +(n - 5) - n)       ashr_11
 *
 * Same scheme as ashr_8 with N = 11: previous chunk shifted right by 11
 * bytes, next chunk shifted left by 5 bytes.
 */
	.p2align 4
LABEL(ashr_11):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$5, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$11, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
LABEL(loop_ashr_11):
	add	$16, %r10
	jg	LABEL(nibble_ashr_11)

LABEL(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$11, %xmm3
	pslldq	$5, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_11)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$11, %xmm3
	pslldq	$5, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_11)

	/* Page-crossing check: 5 bytes of %xmm3 are still uncompared.  */
	.p2align 4
LABEL(nibble_ashr_11):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf800, %edx
	jnz	LABEL(ashr_11_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$5, %r11
	jbe	LABEL(ashr_11_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_11)

	.p2align 4
LABEL(ashr_11_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$11, %xmm0
	psrldq	$11, %xmm3
	jmp	LABEL(aftertail)

/*
 * The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset       corresponding case
 *        n(4~15)            n - 4           11(15 +(n - 4) - n)       ashr_12
 *
 * Same scheme as ashr_8 with N = 12: previous chunk shifted right by 12
 * bytes, next chunk shifted left by 4 bytes.
 */
	.p2align 4
LABEL(ashr_12):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$4, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
LABEL(loop_ashr_12):
	add	$16, %r10
	jg	LABEL(nibble_ashr_12)

LABEL(gobble_ashr_12):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$12, %xmm3
	pslldq	$4, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_12)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$12, %xmm3
	pslldq	$4, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_12)

	/* Page-crossing check: 4 bytes of %xmm3 are still uncompared.  */
	.p2align 4
LABEL(nibble_ashr_12):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf000, %edx
	jnz	LABEL(ashr_12_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$4, %r11
	jbe	LABEL(ashr_12_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_12)

	.p2align 4
LABEL(ashr_12_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$12, %xmm0
	psrldq	$12, %xmm3
	jmp	LABEL(aftertail)

/*
 * The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset       corresponding case
 *        n(3~15)            n - 3           12(15 +(n - 3) - n)       ashr_13
 *
 * Same scheme as ashr_8 with N = 13: previous chunk shifted right by 13
 * bytes, next chunk shifted left by 3 bytes.
 */
	.p2align 4
LABEL(ashr_13):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$3, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
LABEL(loop_ashr_13):
	add	$16, %r10
	jg	LABEL(nibble_ashr_13)

LABEL(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$13, %xmm3
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_13)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$13, %xmm3
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_13)

	/* Page-crossing check: 3 bytes of %xmm3 are still uncompared.  */
	.p2align 4
LABEL(nibble_ashr_13):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xe000, %edx
	jnz	LABEL(ashr_13_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$3, %r11
	jbe	LABEL(ashr_13_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_13)

	.p2align 4
LABEL(ashr_13_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$13, %xmm0
	psrldq	$13, %xmm3
	jmp	LABEL(aftertail)

/*
 * The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset       corresponding case
 *        n(2~15)            n - 2           13(15 +(n - 2) - n)       ashr_14
 *
 * Same scheme as ashr_8 with N = 14: previous chunk shifted right by 14
 * bytes, next chunk shifted left by 2 bytes.
 */
	.p2align 4
LABEL(ashr_14):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$2, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$14, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
LABEL(loop_ashr_14):
	add	$16, %r10
	jg	LABEL(nibble_ashr_14)

LABEL(gobble_ashr_14):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$14, %xmm3
	pslldq	$2, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_14)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$14, %xmm3
	pslldq	$2, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_14)

	/* Page-crossing check: 2 bytes of %xmm3 are still uncompared.  */
	.p2align 4
LABEL(nibble_ashr_14):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xc000, %edx
	jnz	LABEL(ashr_14_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$2, %r11
	jbe	LABEL(ashr_14_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_14)

	.p2align 4
LABEL(ashr_14_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$14, %xmm0
	psrldq	$14, %xmm3
	jmp	LABEL(aftertail)

/*
 * The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset       corresponding case
 *        n(1~15)            n - 1           14(15 +(n - 1) - n)       ashr_15
 *
 * Same scheme as ashr_8 with N = 15: previous chunk shifted right by 15
 * bytes, next chunk shifted left by 1 byte.  Its exittail falls straight
 * through into LABEL(aftertail).
 */
	.p2align 4
LABEL(ashr_15):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$1, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)

	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */

	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
LABEL(loop_ashr_15):
	add	$16, %r10
	jg	LABEL(nibble_ashr_15)

LABEL(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$15, %xmm3
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_15)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$15, %xmm3
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_15)

	/* Page-crossing check: 1 byte of %xmm3 is still uncompared.  */
	.p2align 4
LABEL(nibble_ashr_15):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0x8000, %edx
	jnz	LABEL(ashr_15_exittail)

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmpq	$1, %r11
	jbe	LABEL(ashr_15_exittail)
# endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_15)

	.p2align 4
LABEL(ashr_15_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$15, %xmm3
	psrldq	$15, %xmm0
	/* Falls through into aftertail.  */

	/*
	 * Common end of every ashr_N_exittail.  %xmm3 holds the leftover
	 * bytes of the previous %rdi chunk moved down to byte 0, %xmm0 the
	 * matching null-byte mask and %xmm1 the chunk at (%rsi, %rcx).
	 * After the compare, "not" turns the equal-and-non-null mask into
	 * one whose lowest set bit marks the first difference or null.
	 */
	.p2align 4
LABEL(aftertail):
	TOLOWER (%xmm1, %xmm3)
	pcmpeqb	%xmm3, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	not	%edx

	.p2align 4
LABEL(exit):
	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
LABEL(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	LABEL(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	/*
	 * %rdx carries a mask whose lowest set bit is the index of the
	 * first differing or null byte.  Return the difference of the two
	 * bytes at that index (mapped through the tolower table in the
	 * case-insensitive variants), or 0 when the strncmp count runs out
	 * first.
	 */
	.p2align 4
LABEL(ret):
LABEL(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	%rdx, %r11
	jbe	LABEL(strcmp_exitz)	/* index lies at or past the count limit */
# endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx	/* table of 4-byte entries, indexed by byte value */
	movl	(%rdx,%rax,4), %eax
# endif

	sub	%ecx, %eax
	ret

	/* Strings compare equal: return 0.  */
LABEL(strcmp_exitz):
	xor	%eax, %eax
	ret

	/* Return the difference of the first bytes of %rdi and %rsi.  */
	.p2align 4
LABEL(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
# endif

	sub	%ecx, %eax
	ret
END (STRCMP)

	/*
	 * Table of 32-bit offsets, each relative to the table itself, of
	 * the ashr_N entry points in the order ashr_1 .. ashr_15, ashr_0.
	 */
	.section .rodata,"a",@progbits
	.p2align 3
LABEL(unaligned_table):
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
#endif