/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For sizes from 2 to 7 bytes on a page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compares when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC, one
      vector at a time, to check for early mismatches.  Only do this
      if it is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.

When possible the implementation tries to optimize for the frontend in
the following ways:
Throughput:
    1. All code sections that fit are able to run optimally out of the
       LSD.
    2. All code sections that fit are able to run optimally out of the
       DSB.
    3. Basic blocks are contained in the minimum number of fetch
       blocks necessary.

Latency:
    1. Logically connected basic blocks are put in the same
       cache-line.
    2. Logically connected basic blocks that do not fit in the same
       cache-line are put in adjacent lines.  This can benefit from L2
       spatial prefetching and L1 next-line prefetching.  */

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_evex_movbe
# endif

# define VMOVU		vmovdqu64

# ifdef USE_AS_WMEMCMP
#  define VMOVU_MASK	vmovdqu32
#  define CHAR_SIZE	4
#  define VPCMP	vpcmpd
#  define VPTEST	vptestmd
# else
#  define VMOVU_MASK	vmovdqu8
#  define CHAR_SIZE	1
#  define VPCMP	vpcmpub
#  define VPTEST	vptestmb
# endif


# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# define XMM0		xmm16
# define XMM1		xmm17
# define XMM2		xmm18
# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

/* Warning!
           wmemcmp has to use SIGNED comparison for elements.
           memcmp has to use UNSIGNED comparison for elements.
*/
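
/* For reference only, not part of the build: a rough C model of the
   return-value convention implemented by the return paths below.  On
   a mismatch, memcmp returns the difference of the mismatching bytes
   compared as unsigned values, while wmemcmp compares the mismatching
   4-byte elements as signed ints and only needs the sign of the
   result, which the asm produces branchlessly with setg followed by
   `leal -1(%rdx, %rdx), %eax` (dl = 0 -> -1, dl = 1 -> 1).  The helper
   names here are illustrative, not labels in this file.

       // Reached only when a != b.
       static int ret_memcmp (unsigned char a, unsigned char b)
       {
         return (int) a - (int) b;
       }

       // Reached only when a != b; mirrors the setg/lea idiom.
       static int ret_wmemcmp (int a, int b)
       {
         return (a > b) * 2 - 1;
       }
*/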

	.section .text.evex,"ax",@progbits
/* Cache align memcmp entry.  This allows for much more thorough
   frontend optimization.  */
ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$CHAR_PER_VEC, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it is the hottest case.  */
	ja	L(more_1x_vec)

	/* Create mask for CHARs we want to compare.  This allows us to
	   avoid having to include page cross logic.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2

	/* Safe to load full ymm with mask.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	ret

	.p2align 4
L(return_vec_0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(more_1x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4,(%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
	testl	%eax, %eax
	jnz	L(return_vec_0)

	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq	$(CHAR_PER_VEC * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without
	   any branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi

	/* Wait to load from s1 until the addresses have been adjusted,
	   because the complex address mode would cause unlamination of
	   micro-fused uops.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq	(%rdi), %YMM1, %YMM1
	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
	   oring with YMM1.  Result is stored in YMM4.  */
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4

	/* Test YMM4 against itself.  Store any CHAR mismatches in k1.
	 */
	VPTEST	%YMM4, %YMM4, %k1
	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret
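
/* A hedged aside on the vpternlogd immediates used above and in
   L(loop_4x_vec) below: the imm8 is a 3-input truth table indexed by
   the bits {dst, src1, src2} (the dst bit being the most significant
   index bit), so $0xde computes (dst ^ src2) | src1 and $0xfe computes
   dst | src1 | src2.  A small C check of these encodings, assuming
   that reading of the operand order:

       #include <stdio.h>

       int main (void)
       {
         unsigned imm_de = 0, imm_fe = 0;
         for (int i = 0; i < 8; i++)
           {
             int a = (i >> 2) & 1;   // dst bit
             int b = (i >> 1) & 1;   // src1 bit
             int c = i & 1;          // src2 (or memory) bit
             imm_de |= (unsigned) (((a ^ c) | b) << i);
             imm_fe |= (unsigned) ((a | b | c) << i);
           }
         printf ("0x%x 0x%x\n", imm_de, imm_fe);   // prints 0xde 0xfe
         return 0;
       }
*/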

	.p2align 4,, 8
L(8x_end_return_vec_0_1_2_3):
	movq	%rdx, %rdi
L(8x_return_vec_0_1_2_3):
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
	VPTEST	%YMM1, %YMM1, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)

	VPTEST	%YMM2, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	VPTEST	%YMM3, %YMM3, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)
L(return_vec_3):
	/* bsf saves 1 byte over tzcnt.  This keeps L(return_vec_3) in
	   one fetch block and the entire L(*return_vec_0_1_2_3) in 1
	   cache line.  */
	bsfl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(return_vec_1):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
	   fetch block.  */
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(return_vec_2):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
	   fetch block.  */
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
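
/* Loosely, L(loop_4x_vec) below is equivalent to this C sketch (the
   helper names are hypothetical, for illustration only).  off is
   s2 - s1, taken before s1 was rounded down to a VEC_SIZE boundary so
   only one pointer has to be advanced, and s1_end is the original
   s1 + size in bytes - 4 * VEC_SIZE:

       do
         {
           // XOR the four vector pairs together and OR the results;
           // nonzero means at least one differing CHAR.
           if (xor_4x_vec (s1, s1 + off) != 0)
             return first_mismatch (s1, s1 + off);
           s1 += 4 * VEC_SIZE;
         }
       while (s1 < s1_end);

   Note that `subq $-(VEC_SIZE * 4), %rdi` is presumably used instead
   of the equivalent addq of +128 because -128 fits in a sign-extended
   imm8 while +128 needs an imm32, making the encoding 3 bytes
   shorter.  */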

	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1
	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_end_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4,, 10
L(8x_last_2x_vec):
	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_2)
	/* Naturally aligned to 16 bytes.  */
L(8x_last_1x_vec):
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_3)
	ret

	/* Not ideally aligned (at offset +9 bytes in the fetch block)
	   but not aligning keeps it in the same cache line as
	   L(8x_last_1x/2x_vec), so it is likely worth it.  It also
	   saves code size.  */
	.p2align 4,, 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	addq	%rdx, %rax
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(last_2x_vec):
	/* Check second to last VEC.  */
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0_end)
	ret


	/* Don't align.  Takes 2 fetch blocks either way and aligning
	   will cause code to spill into another cacheline.  */
L(return_vec_1_end):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	/* Don't align.  Takes 2 fetch blocks either way and aligning
	   will cause code to spill into another cacheline.  */
L(return_vec_0_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret
	/* 1 byte until the next cache line.  */

END (MEMCMP)
#endif