/* __memcmpeq optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE bytes one
      VEC at a time to check for early mismatches.  Only do this if
      it is guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_avx2
# endif

# define VPCMPEQ	vpcmpeqb

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_neq0)
	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	/* If all bytes matched, eax will be all 1s, so incl will overflow
	   and set the zero flag.  */
	incl	%eax
	jnz	L(return_neq0)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(return_neq0)

	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
	jnz	L(return_neq0)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Adjust rsi and rdi to avoid indexed address mode.  This ends up
	   saving 16 bytes of code and prevents unlamination and bottlenecks
	   in the AGU.  */
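	/* After the size is added to both pointers below, they point one
	   past the end of their buffers, so the negative displacements
	   cover the last 4 * VEC_SIZE bytes.  Since 4 * VEC < size
	   <= 8 * VEC, these loads may overlap the 4 VECs already checked
	   above; the redundant overlap is harmless and avoids branching
	   on the exact size.  */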
	addq	%rdx, %rsi
	vmovdqu	-(VEC_SIZE * 4)(%rsi), %ymm1
	vmovdqu	-(VEC_SIZE * 3)(%rsi), %ymm2
	addq	%rdx, %rdi

	VPCMPEQ	-(VEC_SIZE * 4)(%rdi), %ymm1, %ymm1
	VPCMPEQ	-(VEC_SIZE * 3)(%rdi), %ymm2, %ymm2

	vmovdqu	-(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	-(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vmovdqu	-VEC_SIZE(%rsi), %ymm4
	VPCMPEQ	-VEC_SIZE(%rdi), %ymm4, %ymm4

	/* Reduce VEC1 - VEC4.  */
	vpand	%ymm1, %ymm2, %ymm2
	vpand	%ymm3, %ymm4, %ymm4
	vpand	%ymm2, %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
L(return_neq0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	/* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32 byte
	   aligned.  */
	.p2align 5
L(less_vec):
	/* Check if size is one or less.  This is necessary for size = 0 but
	   is also faster for size = 1.  */
	cmpl	$1, %edx
	jbe	L(one_or_less)

	/* Check if loading one VEC from either s1 or s2 could cause a page
	   cross.  This can have false positives but is by far the fastest
	   method.  */
	movl	%edi, %eax
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(page_cross_less_vec)

	/* No page cross possible.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl	%eax
	/* Result will be zero if s1 and s2 match.  Otherwise the first set
	   bit will be the first mismatch.  */
	bzhil	%edx, %eax, %eax
	VZEROUPPER_RETURN

	/* Relatively cold but placed close to L(less_vec) for 2 byte jump
	   encoding.  */
	.p2align 4
L(one_or_less):
	jb	L(zero)
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
	/* No ymm register was touched.  */
	ret
	/* Within the same 16 byte block is L(one_or_less).  */
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(last_1x_vec):
	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm2
	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm2, %ymm2
	vpand	%ymm1, %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl	%eax
	VZEROUPPER_RETURN

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
	.p2align 4
L(loop_4x_vec):
	/* rsi has s2 - s1 so get the correct address by adding s1 (in rdi).
	 */
	vmovdqu	(%rsi, %rdi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm2
	vpand	%ymm3, %ymm4, %ymm4
	vpand	%ymm2, %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
	jnz	L(return_neq1)
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check if s1 pointer is at end.  */
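	/* rdx was set to &s1[size] - 4 * VEC_SIZE, so the loop exits once
	   at most 4 * VEC_SIZE unchecked bytes remain.  The tail below
	   re-checks them with loads anchored at rdx, possibly overlapping
	   bytes the loop already compared.  */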
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)
	/* Check last 4 VEC.  */
	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm1
	VPCMPEQ	VEC_SIZE(%rdx), %ymm1, %ymm1

	vmovdqu	(%rsi, %rdx), %ymm2
	VPCMPEQ	(%rdx), %ymm2, %ymm2

	vpand	%ymm3, %ymm4, %ymm4
	vpand	%ymm1, %ymm2, %ymm3
L(8x_last_2x_vec):
	vpand	%ymm3, %ymm4, %ymm4
L(8x_last_1x_vec):
	vpmovmskb %ymm4, %eax
	/* Restore s1 pointer to rdi.  */
	incl	%eax
L(return_neq1):
	VZEROUPPER_RETURN

	/* Relatively cold case as page crosses are unexpected.  */
	.p2align 4
L(page_cross_less_vec):
	cmpl	$16, %edx
	jae	L(between_16_31)
	cmpl	$8, %edx
	ja	L(between_9_15)
	cmpl	$4, %edx
	jb	L(between_2_3)
	/* From 4 to 8 bytes.  No branch when size == 4.  */
	movl	(%rdi), %eax
	subl	(%rsi), %eax
	movl	-4(%rdi, %rdx), %ecx
	movl	-4(%rsi, %rdx), %edi
	subl	%edi, %ecx
	orl	%ecx, %eax
	ret

	.p2align 4,, 8
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */

	/* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.
	 */
	vmovdqu	(%rsi), %xmm1
	vpcmpeqb (%rdi), %xmm1, %xmm1
	vmovdqu	-16(%rsi, %rdx), %xmm2
	vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
	vpand	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	notw	%ax
	/* No ymm register was touched.  */
	ret

	.p2align 4,, 8
L(between_9_15):
	/* From 9 to 15 bytes.  */
	movq	(%rdi), %rax
	subq	(%rsi), %rax
	movq	-8(%rdi, %rdx), %rcx
	movq	-8(%rsi, %rdx), %rdi
	subq	%rdi, %rcx
	orq	%rcx, %rax
	/* edx is guaranteed to be a non-zero int.  */
	cmovnz	%edx, %eax
	ret

	/* Don't align.  This is cold and aligning here will cause code
	   to spill into the next cache line.  */
L(between_2_3):
	/* From 2 to 3 bytes.  No branch when size == 2.  */
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	subl	%ecx, %eax
	movzbl	-1(%rdi, %rdx), %ecx
	/* All machines that support evex will insert a "merging uop"
	   avoiding any serious partial register stalls.  */
	subb	-1(%rsi, %rdx), %cl
	orl	%ecx, %eax
	/* No ymm register was touched.  */
	ret

	/* 2 Bytes from next cache line.  */
END (MEMCMPEQ)
#endif