/* memcmp with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

#include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_sse2
# endif

# ifdef USE_AS_WMEMCMP
#  define PCMPEQ	pcmpeqd
#  define CHAR_SIZE	4
#  define SIZE_OFFSET	(0)
# else
#  define PCMPEQ	pcmpeqb
#  define CHAR_SIZE	1
# endif

# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET	(0)
#  define CHECK_CMP(x, y)	subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET	(CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y)	cmpl x, y
# endif

# define VEC_SIZE	16
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# ifndef MEMCMP
#  define MEMCMP	memcmp
# endif

	.text
ENTRY(MEMCMP)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches in the pmovmskb bitmask.
	   Store in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older
	   hardware (pre SnB).  */
	movl	$0xffff, %ecx
# endif
	cmpq	$CHAR_PER_VEC, %rdx
	ja	L(more_1x_vec)

# ifdef USE_AS_WMEMCMP
	/* Saves a byte of code by keeping the fall-through path for
	   n = [2, 4] in the initial cache line.  */
	decl	%edx
	jle	L(cmp_0_1)

	movq	(%rsi), %xmm0
	movq	(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_0)

	movq	-4(%rsi, %rdx, CHAR_SIZE), %xmm0
	movq	-4(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_end_0_adj)
# else
	cmpl	$8, %edx
	ja	L(cmp_9_16)

	cmpl	$4, %edx
	jb	L(cmp_0_3)

#  ifdef USE_AS_MEMCMPEQ
	movl	(%rsi), %eax
	subl	(%rdi), %eax

	movl	-4(%rsi, %rdx), %esi
	subl	-4(%rdi, %rdx), %esi

	orl	%esi, %eax
	ret
#  else
	/* Combine the low and high 4-byte comparisons.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	-4(%rdi, %rdx), %eax
	shlq	$32, %rcx
	shlq	$32, %rax
	movl	(%rsi), %esi
	movl	(%rdi), %edi
	orq	%rsi, %rcx
	orq	%rdi, %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)
	xorl	%eax, %eax
	ret
#  endif

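	/* Lengths [9, 16]: compare the first and the last 8 bytes; the
	   two loads overlap when the length is below 16, so every byte
	   is covered.  */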
	.p2align 4,, 10
L(cmp_9_16):
#  ifdef USE_AS_MEMCMPEQ
	movq	(%rsi), %rax
	subq	(%rdi), %rax

	movq	-8(%rsi, %rdx), %rcx
	subq	-8(%rdi, %rdx), %rcx
	orq	%rcx, %rax
	/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
	   return long).  */
	setnz	%cl
	movzbl	%cl, %eax
#  else
	movq	(%rsi), %rcx
	movq	(%rdi), %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)

	movq	-8(%rsi, %rdx, CHAR_SIZE), %rcx
	movq	-8(%rdi, %rdx, CHAR_SIZE), %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)
	xorl	%eax, %eax
#  endif
# endif
	ret

	.p2align 4,, 8
L(cmp_0_1):
	/* Flag set by earlier comparison against 1.  */
	jne	L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
	movl	(%rdi), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi), %ecx
	je	L(cmp_0_0)
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	subl	%ecx, %eax
# endif
	ret

	/* Fits in aligning bytes.  */
L(cmp_0_0):
	xorl	%eax, %eax
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_start_0):
	bsfl	%eax, %eax
	movl	(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
	ret
# else

#  ifndef USE_AS_MEMCMPEQ
	.p2align 4,, 14
L(ret_nonzero):
	/* Need to bswap to get proper return without a branch.  */
	bswapq	%rcx
	bswapq	%rax
	subq	%rcx, %rax
	sbbl	%eax, %eax
	orl	$1, %eax
	ret
#  endif

	.p2align 4
L(cmp_0_3):
#  ifdef USE_AS_MEMCMPEQ
	/* No reason to add to the dependency chain on rdx.  Saving the
	   bytes here doesn't change the number of fetch blocks.  */
	cmpl	$1, %edx
	jbe	L(cmp_0_1)
#  else
	/* We need the code-size savings to avoid taking an extra fetch
	   block.  */
	decl	%edx
	jle	L(cmp_0_1)
#  endif
	movzwl	(%rsi), %ecx
	movzwl	(%rdi), %eax

#  ifdef USE_AS_MEMCMPEQ
	subl	%ecx, %eax

	movzbl	-1(%rsi, %rdx), %esi
	movzbl	-1(%rdi, %rdx), %edi
	subl	%edi, %esi
	orl	%esi, %eax
#  else
	bswapl	%ecx
	bswapl	%eax

	/* Implicit right shift by one.  We just need to displace the
	   sign bits.  */
	shrl	%ecx
	shrl	%eax

	/* Eat a partial register stall here.  Saves code size by
	   stopping L(cmp_0_3) from bleeding into the next fetch block
	   and saves an ALU.  */
	movb	(%rsi, %rdx), %cl
	movzbl	(%rdi, %rdx), %edi
	orl	%edi, %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

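	/* Lengths above CHAR_PER_VEC.  Compare the first vector, then
	   dispatch on length; for (CHAR_PER_VEC, 2 * CHAR_PER_VEC]
	   finish with one (possibly overlapping) load of the last
	   vector.  */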
	.p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches in the pmovmskb bitmask.
	   Store in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older
	   hardware (pre SnB).  */
	movl	$0xffff, %ecx
# endif
	movups	(%rsi), %xmm0
	movups	(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
	cmpq	$(CHAR_PER_VEC * 2), %rdx
# else
	/* Offset rdx.  Saves just enough code size to keep the
	   L(last_2x_vec) case and the non-zero return in a single
	   cache line.  */
	subq	$(CHAR_PER_VEC * 2), %rdx
# endif
	ja	L(more_2x_vec)

	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
# ifndef USE_AS_MEMCMPEQ
	/* Don't use `incw ax` as machines this code runs on are liable
	   to have partial register stalls.  */
	jnz	L(ret_nonzero_vec_end_0)
# else
	/* Various return targets for memcmpeq.  Will always be hot in
	   Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
	ret

# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_end_0_adj):
	addl	$3, %edx
#  else
	.p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal	(%rax, %rdx, CHAR_SIZE), %eax
	movl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	addl	%edx, %eax
	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
#  ifndef USE_AS_WMEMCMP
	.p2align 4,, 10
L(ret_nonzero_vec_start_0):
	bsfl	%eax, %eax
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
	ret
#  endif
# else
# endif

	.p2align 5
L(more_2x_vec):
	movups	(VEC_SIZE * 1)(%rsi), %xmm0
	movups	(VEC_SIZE * 1)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_1)

	cmpq	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
	jbe	L(last_2x_vec)

	cmpq	$(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
	ja	L(more_8x_vec)

	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
	   This can harm performance if the non-zero return is in
	   [65, 80] or [97, 112] but helps performance otherwise.
	   Generally zero-return is hotter.  */
	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 3)(%rsi), %xmm2
	movups	(VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jnz	L(ret_nonzero_vec_start_2_3)

	cmpl	$(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
	jbe	L(last_2x_vec)

	movups	(VEC_SIZE * 4)(%rsi), %xmm0
	movups	(VEC_SIZE * 4)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 5)(%rsi), %xmm2
	movups	(VEC_SIZE * 5)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
	jz	L(last_2x_vec)
	ret
# else
	jnz	L(ret_nonzero_vec_start_4_5)
# endif
	.p2align 4
L(last_2x_vec):
	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	subl	%ecx, %eax
# ifdef USE_AS_MEMCMPEQ
	/* Various return targets for memcmpeq.  Will always be hot in
	   Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
	ret
# else
	jnz	L(ret_nonzero_vec_end_1)
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_end_1):
	pmovmskb %xmm1, %ecx
	/* High 16 bits of eax are guaranteed to be all ones.  Rotate
	   them in so we can do `or + not` with just `xor`.  */
	rorl	$16, %eax
	xorl	%ecx, %eax
	/* Partial register stall.  */

	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal	(%rax, %rdx, CHAR_SIZE), %eax
	movl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	addl	%edx, %eax
	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_vec_start_4_5):
	pmovmskb %xmm1, %edx
	sall	$16, %eax
	leal	1(%rax, %rdx), %eax
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 4)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 4)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_start_1):
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 1)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 1)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

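	/* Lengths above 8x VEC.  Convert %rsi to an offset from %rdi,
	   align %rdi down to VEC_SIZE (the PCMPEQ memory operands in
	   the loop must be aligned), rebuild %rsi, and compute the loop
	   bound in %rdx.  Each L(loop_4x) iteration compares 4
	   vectors.  */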
	.p2align 4
L(more_8x_vec):
	subq	%rdi, %rsi
	leaq	(VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
	andq	$(VEC_SIZE * -1), %rdi
	addq	%rdi, %rsi
	.p2align 4
L(loop_4x):
	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 3)(%rsi), %xmm1

	PCMPEQ	(VEC_SIZE * 2)(%rdi), %xmm0
	PCMPEQ	(VEC_SIZE * 3)(%rdi), %xmm1

	movups	(VEC_SIZE * 4)(%rsi), %xmm2
	movups	(VEC_SIZE * 5)(%rsi), %xmm3

	PCMPEQ	(VEC_SIZE * 4)(%rdi), %xmm2
	PCMPEQ	(VEC_SIZE * 5)(%rdi), %xmm3

	pand	%xmm0, %xmm1
	pand	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_loop)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rsi
	cmpq	%rdi, %rdx
	ja	L(loop_4x)
	/* Get remaining length in edx.  */
	subl	%edi, %edx
	/* Restore offset so we can reuse L(last_2x_vec).  */
	addl	$(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
	shrl	$2, %edx
# endif
	cmpl	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
	jbe	L(last_2x_vec)


	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 3)(%rsi), %xmm2
	movups	(VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jz	L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
	ret
# else

	.p2align 4
L(ret_nonzero_vec_start_2_3):
	pmovmskb %xmm1, %edx
	sall	$16, %eax
	leal	1(%rax, %rdx), %eax

	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

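	/* Mismatch inside the 4x loop.  Extract the per-vector compare
	   masks and combine them into one 64-bit value whose lowest set
	   bit is the first mismatching byte (bits [0, 31] cover
	   VEC 2/3, bits [32, 63] cover VEC 4/5), then find it with a
	   single bsfq.  */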
	.p2align 4
L(ret_nonzero_loop):
	pmovmskb %xmm0, %ecx
	pmovmskb %xmm1, %edx
	sall	$(VEC_SIZE * 1), %edx
	leal	1(%rcx, %rdx), %edx
	pmovmskb %xmm2, %ecx
	/* High 16 bits of eax are guaranteed to be all ones.  Rotate
	   them in so we can do `or + not` with just `xor`.  */
	rorl	$16, %eax
	xorl	%ecx, %eax

	salq	$32, %rax
	orq	%rdx, %rax

	bsfq	%rax, %rax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif
END(MEMCMP)
#endif