/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RSI_LP, %RSI_LP
#  else
	test	%RSI_LP, %RSI_LP
#  endif
	jz	L(zero)
	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If length < VEC_SIZE handle special.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	/* If empty continue to aligned_more.  Otherwise return bit
	   position of first match.  */
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

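	/* The L(first_vec_x{1..4}) labels below convert a null found in
	   the Nth VEC_SIZE block after the VEC_SIZE - 1 aligned pointer
	   back into a length from the original string start.  For strnlen
	   the adjustment reuses the count already accumulated in ecx;
	   otherwise it is recomputed from rdi - rdx (current position
	   minus original pointer).  */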
	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 4 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	incl	%edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 3 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 2 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 2 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 3 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1.  + CHAR_SIZE
	   because it simplifies the logic in last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
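	/* At this point (for strnlen) rcx holds the number of characters
	   from the string start up to the end of the next 4 * VEC_SIZE
	   region, plus one extra character.  Subtracting it from the
	   remaining length below tells us whether the limit falls within
	   these four vectors, and L(first_vec_x{1..4}) reuse the same
	   value to rebuild the return value.  */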
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1.  */
# ifdef USE_AS_STRNLEN
	/* Before adjusting length check if at last VEC_SIZE * 4.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# else
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively, there is no issue
	   with overlap.  */
	vmovdqa	1(%rdi), %ymm1
	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4

	VPMINU	%ymm2, %ymm4, %ymm5
	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb	%ymm5, %ecx

	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	subq	%rdx, %rdi
	testl	%eax, %eax
	jnz	L(last_vec_return_x0)

	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_return_x1)

	/* Combine last 2 VEC.  */
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb	%ymm3, %eax
	/* rcx has the combined result from all 4 VEC.  It will only be
	   used if none of the first 3 VEC contain a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN


# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
	subq	$-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	vpmovmskb	%ymm1, %eax
	/* Check if remaining length > VEC_SIZE * 2.  This works even if
	   esi is off by VEC_SIZE * 4.  */
	testl	$(VEC_SIZE * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by VEC_SIZE * 4 in either direction
	   depending on where this was called from.  This fixes that.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	subl	$VEC_SIZE, %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
# endif

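	/* L(last_vec_return_x{0,1}) handle a null found in the first or
	   second vector of the iteration that ended the 4x loop.  rdi
	   already holds the current offset from the string start (rdx was
	   subtracted after the loop), so only the distance back to the
	   matching vector has to be removed.  */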
	.p2align 4
L(last_vec_return_x0):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 4 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

L(max):
	movq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	subl	$(VEC_SIZE * 3), %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 3 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x1):
	/* Essentially a duplicate of first_vec_x1 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* Essentially a duplicate of first_vec_x2 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(VEC_SIZE * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 2 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
L(max_end):
	movq	%r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	/* Align data to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Remove the leading bytes.  sarxl only uses bits [4:0] of the
	   count so there is no need to manually mod edx.  */
	sarxl	%edx, %eax, %eax
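	/* eax now holds the null-match mask for the 32-byte aligned block
	   that contains the start of the string, with the bits belonging
	   to bytes before the start shifted out, so bit 0 corresponds to
	   the first byte of the string.  */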
# ifdef USE_AS_STRNLEN
	testl	%eax, %eax
	jnz	L(cross_page_less_vec)
	leaq	1(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %ecx
#  endif
	/* Check length.  */
	cmpq	%rsi, %rcx
	jb	L(cross_page_continue)
	movq	%r8, %rax
# else
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide length by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
#  ifdef USE_AS_WCSLEN
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif