/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * VEC_SIZE bytes, one
      VEC at a time, to check for early mismatches.  Only do this if
      it is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */


# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define CHAR_SIZE	4
#  define VPCMPEQ	vpcmpeqd
# else
#  define CHAR_SIZE	1
#  define VPCMPEQ	vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

/* Warning!
           wmemcmp has to use SIGNED comparison for elements.
           memcmp has to use UNSIGNED comparison for elements.
*/

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
	shl	$2, %RDX_LP
# elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
	incl	%eax
	jnz	L(return_vec_0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	/* If the whole VEC compared equal, eax will be all 1s, so incl
	   will overflow and set the zero flag.  */
	incl	%eax
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)
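
	/* Note on the compare/mask idiom used for every VEC in this
	   file: VPCMPEQ sets each equal byte to 0xff, vpmovmskb packs
	   the byte sign bits into a 32-bit mask, and incl turns an
	   all-equal (all-ones) mask into zero, setting ZF.  On a
	   mismatch the lowest set bit of the incremented mask is the
	   offset of the first differing byte, e.g. a first mismatch at
	   byte 5 gives mask 0xffffffdf, incl -> 0xffffffe0,
	   tzcnt -> 5.  */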

	/* Check third and fourth VEC no matter what.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(return_vec_2)
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpmovmskb %ymm4, %ecx
	incl	%ecx
	jnz	L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	vmovdqu	-(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
	vmovdqu	-(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi

	/* Wait to use s1 as a memory operand until the addresses have
	   been adjusted, because complex addressing modes cause
	   unlamination of micro-fusion.  */
	VPCMPEQ	(%rdi), %ymm1, %ymm1
	VPCMPEQ	(VEC_SIZE)(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	/* Reduce the compare results in ymm1 - ymm4.  */
	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	incl	%ecx
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_1):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_2):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	/* NB: p2align 5 here to ensure the 4x loop is 32 byte aligned.  */
	.p2align 5
L(8x_return_vec_0_1_2_3):
	/* Returning from L(more_8x_vec) requires restoring rsi.  */
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_vec_0)

	vpmovmskb %ymm2, %eax
	incl	%eax
	jnz	L(return_vec_1)

	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(return_vec_2)
L(return_vec_3):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(more_8x_vec):
	/* Set rdx to the end of s1 minus 4 * VEC_SIZE (the loop bound).  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
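	/* Because %rdi was rounded down to a VEC_SIZE boundary before
	   being advanced, the first loop iteration may re-compare up to
	   VEC_SIZE - 1 bytes that were already checked above;
	   re-comparing equal bytes is harmless.  (subq of a negative
	   immediate is used instead of addq because -(VEC_SIZE * 4) =
	   -128 fits in a sign-extended imm8 while +128 does not.)  */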
	.p2align 4
L(loop_4x_vec):
	/* rsi has s2 - s1, so get the correct s2 address by adding s1
	   (in rdi).  */
	vmovdqu	(%rsi, %rdi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	incl	%ecx
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check if the s1 pointer is at the end.  */
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	/* Check last 4 VEC.  */
	vmovdqu	(%rsi, %rdx), %ymm1
	VPCMPEQ	(%rdx), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm2
	VPCMPEQ	VEC_SIZE(%rdx), %ymm2, %ymm2

	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	/* Restore the s1 pointer to rdi.  */
	movq	%rdx, %rdi
	incl	%ecx
	jnz	L(8x_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	VZEROUPPER_RETURN

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4
L(8x_last_2x_vec):
	/* Check second to last VEC.  rdx holds the end pointer of s1
	   minus 4 * VEC_SIZE and ymm3 has already been loaded with the
	   second to last VEC from s2.  */
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(8x_return_vec_2)
	/* Check last VEC.  */
	.p2align 4
L(8x_last_1x_vec):
	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
	jnz	L(8x_return_vec_3)
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
	/* Check second to last VEC.  */
	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_vec_1_end)
	/* Check last VEC.  */
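	/* NB: %rdx holds the total size on this path, so the load below
	   covers the final VEC_SIZE bytes of each buffer.  It may
	   overlap bytes that were already compared above; re-comparing
	   equal bytes is harmless.  */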
L(last_1x_vec):
	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_vec_0_end)
	VZEROUPPER_RETURN

	.p2align 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	tzcntl	%eax, %eax
	addq	%rdx, %rax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_1_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_0_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(less_vec):
	/* Check if one or less CHAR.  This is necessary for size = 0 but
	   is also faster for size = CHAR_SIZE.  */
	cmpl	$CHAR_SIZE, %edx
	jbe	L(one_or_less)

	/* Check if loading one VEC from either s1 or s2 could cause a
	   page cross.  This can have false positives but is by far the
	   fastest method.  */
	movl	%edi, %eax
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(page_cross_less_vec)

	/* No page cross possible.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl	%eax
	/* Result will be zero if s1 and s2 match.  Otherwise the first
	   set bit will be the first mismatch.  */
	bzhil	%edx, %eax, %edx
	jnz	L(return_vec_0)
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(page_cross_less_vec):
	/* If USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
	   bytes.  */
	cmpl	$16, %edx
	jae	L(between_16_31)
# ifndef USE_AS_WMEMCMP
	cmpl	$8, %edx
	jae	L(between_8_15)
	/* Fall through for [4, 7].  */
	cmpl	$4, %edx
	jb	L(between_2_3)

	movbe	(%rdi), %eax
	movbe	(%rsi), %ecx
	shlq	$32, %rax
	shlq	$32, %rcx
	movbe	-4(%rdi, %rdx), %edi
	movbe	-4(%rsi, %rdx), %esi
	orq	%rdi, %rax
	orq	%rsi, %rcx
	subq	%rcx, %rax
	/* Fast path for return zero.  */
	jnz	L(ret_nonzero)
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(one_or_less):
	jb	L(zero)
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4,, 5
L(ret_nonzero):
	sbbl	%eax, %eax
	orl	$1, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4,, 2
L(zero):
	xorl	%eax, %eax
	/* No ymm register was touched.  */
	ret
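
	/* The movbe paths (the [4, 7] case above and L(between_8_15)
	   below) load both chunks big-endian, so the byte that differs
	   first in memory ends up in the most significant differing
	   position and an unsigned comparison of the loaded values
	   matches memcmp's byte-wise ordering.  L(ret_nonzero) converts
	   the carry flag left by that subtraction into -1 or 1 with
	   sbb/or.  */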

	.p2align 4
L(between_8_15):
	movbe	(%rdi), %rax
	movbe	(%rsi), %rcx
	subq	%rcx, %rax
	jnz	L(ret_nonzero)
	movbe	-8(%rdi, %rdx), %rax
	movbe	-8(%rsi, %rdx), %rcx
	subq	%rcx, %rax
	/* Fast path for return zero.  */
	jnz	L(ret_nonzero)
	/* No ymm register was touched.  */
	ret
# else
	/* If USE_AS_WMEMCMP fall through into the 8-15 byte case.  */
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(return_vec_0)
	/* Use overlapping loads to avoid branches.  */
	leaq	-8(%rdi, %rdx), %rdi
	leaq	-8(%rsi, %rdx), %rsi
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	/* Fast path for return zero.  */
	jnz	L(return_vec_0)
	/* No ymm register was touched.  */
	ret
# endif

	.p2align 4,, 10
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	vmovdqu	(%rsi), %xmm2
	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(return_vec_0)

	/* Use overlapping loads to avoid branches.  */

	vmovdqu	-16(%rsi, %rdx), %xmm2
	leaq	-16(%rdi, %rdx), %rdi
	leaq	-16(%rsi, %rdx), %rsi
	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	/* Fast path for return zero.  */
	jnz	L(return_vec_0)
	/* No ymm register was touched.  */
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4,, 2
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(one_or_less):
	jb	L(zero)
	movl	(%rdi), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi), %ecx
	je	L(zero)
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
	/* No ymm register was touched.  */
	ret
# else

	.p2align 4
L(between_2_3):
	/* Load as big endian to avoid branches.  */
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	bswap	%eax
	bswap	%ecx
	shrl	%eax
	shrl	%ecx
	movzbl	-1(%rdi, %rdx), %edi
	movzbl	-1(%rsi, %rdx), %esi
	orl	%edi, %eax
	orl	%esi, %ecx
	/* Subtraction is okay because the upper bit is zero.  */
	subl	%ecx, %eax
	/* No ymm register was touched.  */
	ret
# endif

END (MEMCMP)
#endif