/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

#if ISA_SHOULD_BUILD (4)

# ifndef MEMCHR
#  define MEMCHR	__memchr_evex
# endif

# ifdef USE_AS_WMEMCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPMINU	vpminud
#  define VPCMP		vpcmpd
#  define VPCMPEQ	vpcmpeqd
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPMINU	vpminub
#  define VPCMP		vpcmpb
#  define VPCMPEQ	vpcmpeqb
#  define CHAR_SIZE	1
# endif

	/* In the 4x loop the RTM and non-RTM versions have data pointers
	   that are off by VEC_SIZE * 4, with the RTM version being
	   VEC_SIZE * 4 greater.  This is represented by BASE_OFFSET.
	   Also, because the RTM version uses vpcmp, which stores one bit
	   per element compared, while the non-RTM version uses vpcmpeq,
	   which stores one bit per byte compared, RET_SCALE of CHAR_SIZE
	   is only relevant for the RTM version.  */
# ifdef USE_IN_RTM
#  define VZEROUPPER
#  define BASE_OFFSET	(VEC_SIZE * 4)
#  define RET_SCALE	CHAR_SIZE
# else
#  define VZEROUPPER	vzeroupper
#  define BASE_OFFSET	0
#  define RET_SCALE	1
# endif

	/* In the return from the 4x loop the memchr and rawmemchr
	   versions have data pointers that are off by VEC_SIZE * 4, with
	   the memchr version being VEC_SIZE * 4 greater.  */
# ifdef USE_AS_RAWMEMCHR
#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
#  define RAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define RET_OFFSET	BASE_OFFSET
#  define RAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif

# define XMMZERO	xmm23
# define YMMZERO	ymm23
# define XMMMATCH	xmm16
# define YMMMATCH	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# ifndef SECTION
#  define SECTION(p)	p##.evex
# endif

# define VEC_SIZE	32
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE	4096
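	/* Overview of the strategy used below, as an illustrative outline
	   only (the real control flow is the assembly that follows):

	     broadcast CHAR into YMMMATCH;
	     if the first unaligned VEC_SIZE load cannot cross a page,
	       check it directly; otherwise compare from a VEC_SIZE-aligned
	       pointer and shift off the bits for bytes before the start;
	     align to VEC_SIZE and check up to 4 more vectors one at a
	       time;
	     align to 4 * VEC_SIZE and loop, testing 4 vectors per
	       iteration with a single branch on the combined result;
	     finally check the last 1-4 vectors against the remaining
	       length (memchr only).  */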

	.section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (MEMCHR, 6)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	L(zero)

#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
# endif
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH
	/* Check if we may cross page boundary with one vector load.  */
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMP	$0, (%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length < CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	ret

# ifndef USE_AS_RAWMEMCHR
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Check if the first match was within the given length.  NB:
	   tzcnt has a false data dependency on its destination.  eax
	   already had a data dependency on esi, so this should have no
	   effect here.  */
	tzcntl	%eax, %esi
#  ifdef USE_AS_WMEMCHR
	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
#  else
	addq	%rsi, %rdi
#  endif
	xorl	%eax, %eax
	cmpl	%esi, %edx
	cmovg	%rdi, %rax
	ret
# endif

	.p2align 4
L(cross_page_boundary):
	/* Save the pointer before aligning: its original value is needed
	   for computing the return address if a byte is found, or for
	   adjusting the length if it is not and this is memchr.  */
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE.  ALGN_PTR_REG is rcx for memchr and rdi
	   for rawmemchr.  */
	andq	$-VEC_SIZE, %ALGN_PTR_REG
	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
	kmovd	%k0, %r8d
# ifdef USE_AS_WMEMCHR
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes.  */
	sarl	$2, %eax
# endif
# ifndef USE_AS_RAWMEMCHR
	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
	subl	%eax, %esi
# endif
# ifdef USE_AS_WMEMCHR
	andl	$(CHAR_PER_VEC - 1), %eax
# endif
	/* Remove the leading bytes.  */
	sarxl	%eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
# else
	addq	%RAW_PTR_REG, %rax
# endif
	ret
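
	/* The following blocks return a match found by the 4x probe done
	   at L(aligned_more).  eax holds the compare mask for the
	   matching vector; the VEC_SIZE * N displacement folded into the
	   lea mirrors the offset used by the corresponding VPCMP, and the
	   CHAR_SIZE scale converts the element index back to bytes for
	   wmemchr.  */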
	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 5
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
	/* Align data to VEC_SIZE.  */
L(cross_page_continue):
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	andq	$-VEC_SIZE, %rdi
	/* esi is used for adjusting the length to see if we are near the
	   end.  */
	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
# else
	andq	$-VEC_SIZE, %rdi
L(cross_page_continue):
# endif
	/* Load first VEC regardless.  */
	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  If near the end handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)


# ifndef USE_AS_RAWMEMCHR
	/* Check if at the last CHAR_PER_VEC * 4 of length.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* +VEC_SIZE if USE_IN_RTM, otherwise +VEC_SIZE * 5.  */
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi

	/* Align data to VEC_SIZE * 4 for the loop and readjust the
	   length.  */
#  ifdef USE_AS_WMEMCHR
	movl	%edi, %ecx
	andq	$-(4 * VEC_SIZE), %rdi
	subl	%edi, %ecx
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
	addq	%rcx, %rdx
#  else
	addq	%rdi, %rdx
	andq	$-(4 * VEC_SIZE), %rdi
	subq	%rdi, %rdx
#  endif
# else
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
	andq	$-(4 * VEC_SIZE), %rdi
# endif
# ifdef USE_IN_RTM
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# else
	/* Copy YMMMATCH to ymm0 so we can use vpcmpeq, which is not
	   encodable with EVEX registers (ymm16-ymm31).  */
	vmovdqa64 %YMMMATCH, %ymm0
# endif
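
	/* Note on the vpternlogd immediates used in the non-RTM loop
	   below: the immediate is the 8-bit truth table of the three
	   inputs, so 0xfe (254) computes A | B | C and accumulates the
	   VEC2-VEC4 byte matches into ymm4, while 0x01 computes
	   ~(A | B | C), which combined with the {%k1}{z} zero mask lets
	   the wmemchr variant fold VEC1's not-equal mask into the same
	   register.  */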
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Two versions of the loop: one that does not require vzeroupper
	   by not using ymm0-ymm15, and another that does require
	   vzeroupper because it uses ymm0-ymm15.  The reason ymm0-ymm15
	   is used at all is that there is no EVEX encoding of vpcmpeq,
	   and with vpcmpeq this loop can be performed more efficiently.
	   The non-vzeroupper version is safe for RTM, while the
	   vzeroupper version should be preferred if RTM is not
	   supported.  */
# ifdef USE_IN_RTM
	/* It would be possible to save some instructions using 4x VPCMP,
	   but the bottleneck on port 5 makes it not worth it.  */
	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
	/* The xor sets bytes that match esi to zero.  */
	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
	VPCMP	$0, %YMM3, %YMMZERO, %k2
# else
	/* Since vptern can only take 3x vectors, it is fastest to do 1
	   vec separately with EVEX vpcmp.  */
#  ifdef USE_AS_WMEMCHR
	/* vptern can only accept masks for epi32/epi64, so an
	   instruction can only be saved by using a not-equals mask on
	   vptern with wmemchr.  */
	VPCMP	$4, (%rdi), %YMMMATCH, %k1
#  else
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
#  endif
	/* Compare 3x with vpcmpeq and or them all together with
	   vptern.  */
	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
#  ifdef USE_AS_WMEMCHR
	/* This takes the not of the or between ymm2, ymm3 and ymm4, and
	   also combines the result from VEC0 via the zero mask.  */
	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
	vpmovmskb %ymm4, %ecx
#  else
	/* 254 is the mask for oring ymm2, ymm3, ymm4 into ymm4.  */
	vpternlogd $254, %ymm2, %ymm3, %ymm4
	vpmovmskb %ymm4, %ecx
	kmovd	%k1, %eax
#  endif
# endif

# ifdef USE_AS_RAWMEMCHR
	subq	$-(VEC_SIZE * 4), %rdi
# endif
# ifdef USE_IN_RTM
	kortestd %k2, %k3
# else
#  ifdef USE_AS_WMEMCHR
	/* ecx contains the not of the matches.  All 1s means no matches.
	   incl will overflow and set the zero flag if that is the
	   case.  */
	incl	%ecx
#  else
	/* Check if either VEC1 (eax) or VEC2-VEC4 (ecx) is non-zero.
	   Adding eax to ecx is not an issue because if eax is non-zero
	   it will be used for returning the match.  If it is zero the
	   add does nothing.  */
	addq	%rax, %rcx
#  endif
# endif
# ifdef USE_AS_RAWMEMCHR
	jz	L(loop_4x_vec)
# else
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)

	/* Fall through into the case of fewer than 4 vectors of length
	   remaining.  */
	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
	kmovd	%k0, %eax
	VZEROUPPER

L(last_4x_vec_or_less):
	/* Check if the first VEC contained a match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < CHAR_PER_VEC.  */
	addl	$CHAR_PER_VEC, %edx
	jle	L(zero_end)

	/* Check VEC2 and compare any match with the remaining length.  */
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end):
	ret

L(set_zero_end):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x1_check):
	/* eax must be non-zero.  Use bsfl to save code size.  */
	bsfl	%eax, %eax
	/* Adjust length.  */
	subl	$-(CHAR_PER_VEC * 4), %edx
	/* Check if the match is within the remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if a match was found in
	   the loop.  */

# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
	/* k1 has the not of the matches with VEC1.  */
	kmovd	%k1, %eax
#  ifdef USE_AS_WMEMCHR
	subl	$((1 << CHAR_PER_VEC) - 1), %eax
#  else
	incl	%eax
#  endif
# else
	/* eax already has the matches for VEC1.  */
	testl	%eax, %eax
# endif
	jnz	L(last_vec_x1_return)

# ifdef USE_IN_RTM
	VPCMP	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %eax
# else
	vpmovmskb %ymm2, %eax
# endif
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

# ifdef USE_IN_RTM
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_return)

	kmovd	%k3, %eax
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
# else
	vpmovmskb %ymm3, %eax
	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
	salq	$VEC_SIZE, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
	VZEROUPPER
# endif
	ret

	.p2align 4,, 10
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count if
	   relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHR and
	   USE_IN_RTM are both defined, otherwise RET_SCALE = 1).  */
	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
	VZEROUPPER
	ret

# ifdef USE_IN_RTM
	.p2align 4
L(last_vec_x3_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif
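
	/* Tail handling for memchr: in the L(last_4x_vec) path below the
	   remaining length is turned into a bit mask with bzhi and ANDed
	   with the compare masks so that matches past the end of the
	   buffer are ignored; the shorter tails instead compare the match
	   position against the remaining length directly.  */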
# ifndef USE_AS_RAWMEMCHR
	.p2align 4,, 5
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length <= CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jle	L(last_2x_vec)

	.p2align 4
L(last_4x_vec):
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)


	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Create a mask for possible matches within the remaining
	   length.  */
#  ifdef USE_AS_WMEMCHR
	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
	bzhil	%edx, %ecx, %ecx
#  else
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
#  endif
	/* Test matches in the data against the length mask.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after the
	   remaining length was found to be > CHAR_PER_VEC * 2).  */
	subl	$CHAR_PER_VEC, %edx
	jbe	L(zero_end2)


	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Shift the remaining length mask for the last VEC.  */
#  ifdef USE_AS_WMEMCHR
	shrl	$CHAR_PER_VEC, %ecx
#  else
	shrq	$CHAR_PER_VEC, %rcx
#  endif
	andl	%ecx, %eax
	jz	L(zero_end2)
	bsfl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
	ret

L(last_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif
	/* 7 bytes from next cache line.  */
END (MEMCHR)
#endif