/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_evex
# endif

/* Aligned 512-bit-capable move; only the ymm (256-bit) forms of the
   EVEX registers below are ever loaded.  */
# define VMOVA		vmovdqa64

# ifdef USE_AS_WCSLEN
/* wchar_t variant: compare/min on 32-bit elements, so each mask bit
   covers one 4-byte character.  */
#  define VPCMP		vpcmpd
#  define VPMINU	vpminud
#  define SHIFT_REG	ecx
#  define CHAR_SIZE	4
# else
/* byte variant: each mask bit covers one byte.  */
#  define VPCMP		vpcmpb
#  define VPMINU	vpminub
#  define SHIFT_REG	edx
#  define CHAR_SIZE	1
# endif

/* xmm16/ymm16-ymm22: EVEX-only registers, so legacy SSE/AVX state is
   untouched and no vzeroupper is needed before returning.  */
# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# define VEC_SIZE	32
# define PAGE_SIZE	4096
/* Number of characters (not bytes) covered by one vector.  */
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

/* size_t STRLEN (const char *s)                    [strlen/wcslen]
   size_t STRLEN (const char *s, size_t maxlen)     [strnlen/wcsnlen]

   In:   rdi = s; rsi = maxlen in characters (USE_AS_STRNLEN only).
   Out:  rax = string length in characters (CHAR_SIZE units).
   Register roles inside the function:
     rdx = original s (saved before aligning rdi),
     r8  = saved maxlen (USE_AS_STRNLEN, for the "hit the limit" return),
     k0/k1/k2 = null-match masks, YMMZERO = all-zero comparand.  */

	.section .text.evex,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(zero)
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
	   null byte.  */
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
# ifdef USE_AS_STRNLEN
	/* If length < CHAR_PER_VEC handle special.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	/* Null found in the first vector: its bit index is the length.  */
	tzcntl	%eax, %eax
	ret
# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  (rsi <= CHAR_PER_VEC here, so
	   the bit lands at or below bit 32 of rax.)  */
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
# else
	/* edi - edx = bytes from the original pointer to the aligned
	   base; add the in-vector offset plus one vector.  */
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	CHAR_PER_VEC(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
	ret

	.p2align 5
L(aligned_more):
	/* Save the original pointer in rdx; all later length results are
	   computed as (current rdi - rdx) scaled to characters.  */
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-(VEC_SIZE), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + CHAR_SIZE because it simplifies the logic in
	   last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
	/* Load first VEC regardless.  */
	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	test	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	addq	$VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
	/* Check if at last VEC_SIZE * 4 length.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# endif
	/* Align data to VEC_SIZE * 4.  */
	andq	$-(VEC_SIZE * 4), %rdi

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Load first VEC regardless.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.  Since
	   the matches in ymm2/ymm4 can only be returned if there were no
	   matches in ymm1/ymm3 respectively there is no issue with overlap.
	 */
	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4

	/* A zero element in YMM2 means a null in VEC1 or VEC2; likewise
	   YMM4 covers VEC3/VEC4.  */
	VPCMP	$0, %YMM2, %YMMZERO, %k0
	VPCMP	$0, %YMM4, %YMMZERO, %k1
	subq	$-(VEC_SIZE * 4), %rdi
	kortestd %k0, %k1
	jz	L(loop_4x_vec)

	/* Check if end was in first half.  */
	kmovd	%k0, %eax
	/* From here on rdi holds (aligned ptr - original ptr), i.e. the
	   character count up to the start of this 4x group.  */
	subq	%rdx, %rdi
# ifdef USE_AS_WCSLEN
	shrq	$2, %rdi
# endif
	testl	%eax, %eax
	jz	L(second_vec_return)

	VPCMP	$0, %YMM1, %YMMZERO, %k2
	kmovd	%k2, %edx
	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
# ifdef USE_AS_WCSLEN
	/* 8 chars per VEC: both masks fit in 32 bits.  */
	sall	$CHAR_PER_VEC, %eax
	orl	%edx, %eax
	tzcntl	%eax, %eax
# else
	/* 32 chars per VEC: need a 64-bit combined mask.  */
	salq	$CHAR_PER_VEC, %rax
	orq	%rdx, %rax
	tzcntq	%rax, %rax
# endif
	addq	%rdi, %rax
	ret


# ifdef USE_AS_STRNLEN

L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, %YMM1, %YMMZERO, %k0
	addq	$(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
	kmovd	%k0, %eax
	/* If remaining length > VEC_SIZE * 2.  This works if esi is off by
	   VEC_SIZE * 4.  */
	testl	$(CHAR_PER_VEC * 2), %esi
	jnz	L(last_4x_vec)

	/* length may have been negative or positive by an offset of
	   CHAR_PER_VEC * 4 depending on where this was called from.  This
	   fixes that.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	/* Check the end of data.  */
	subl	$CHAR_PER_VEC, %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret
L(max):
	/* Limit reached before any null: return the saved maxlen.  */
	movq	%r8, %rax
	ret
# endif

	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
	   in the 4x VEC loop can use 2 byte encoding.  */
	.p2align 4
L(second_vec_return):
	VPCMP	$0, %YMM3, %YMMZERO, %k0
	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
# ifdef USE_AS_WCSLEN
	/* 8-bit masks per VEC: concatenate into 16 bits.  */
	kunpckbw %k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# else
	/* 32-bit masks per VEC: concatenate into 64 bits.  */
	kunpckdq %k0, %k1, %k0
	kmovq	%k0, %rax
	tzcntq	%rax, %rax
# endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret


# ifdef USE_AS_STRNLEN
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	/* Check the end of data.  */
	subl	$(CHAR_PER_VEC * 3), %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x2):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(CHAR_PER_VEC * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
	ret
L(max_end):
	movq	%r8, %rax
	ret
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi
	/* Load the aligned vector containing the start of the string;
	   bits below the start offset are shifted out below.  */
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	/* Remove the leading bytes.  */
# ifdef USE_AS_WCSLEN
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes.  */
	movl	%edx, %ecx
	shrl	$2, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
# endif
	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  (sarx
	   masks the shift count to 5 bits, so edx needs no masking.)  */
	sarxl	%SHIFT_REG, %eax, %eax
	testl	%eax, %eax
# ifndef USE_AS_STRNLEN
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
	ret
# else
	jnz	L(cross_page_less_vec)
#  ifndef USE_AS_WCSLEN
	movl	%edx, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
#  endif
	/* eax = characters left in this vector after the start offset.  */
	movl	$CHAR_PER_VEC, %eax
	subl	%ecx, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	ja	L(cross_page_continue)
	movl	%esi, %eax
	ret
L(cross_page_less_vec):
	tzcntl	%eax, %eax
	/* Select min of length and position of first null.  */
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	ret
# endif

END (STRLEN)
#endif