1/* strchr/strchrnul optimized with 256-bit EVEX instructions. 2 Copyright (C) 2021-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <isa-level.h> 20 21#if ISA_SHOULD_BUILD (4) 22 23# include <sysdep.h> 24 25# ifndef STRCHR 26# define STRCHR __strchr_evex 27# endif 28 29# define VMOVU vmovdqu64 30# define VMOVA vmovdqa64 31 32# ifdef USE_AS_WCSCHR 33# define VPBROADCAST vpbroadcastd 34# define VPCMP vpcmpd 35# define VPTESTN vptestnmd 36# define VPMINU vpminud 37# define CHAR_REG esi 38# define SHIFT_REG ecx 39# define CHAR_SIZE 4 40# else 41# define VPBROADCAST vpbroadcastb 42# define VPCMP vpcmpb 43# define VPTESTN vptestnmb 44# define VPMINU vpminub 45# define CHAR_REG sil 46# define SHIFT_REG edx 47# define CHAR_SIZE 1 48# endif 49 50# define XMMZERO xmm16 51 52# define YMMZERO ymm16 53# define YMM0 ymm17 54# define YMM1 ymm18 55# define YMM2 ymm19 56# define YMM3 ymm20 57# define YMM4 ymm21 58# define YMM5 ymm22 59# define YMM6 ymm23 60# define YMM7 ymm24 61# define YMM8 ymm25 62 63# define VEC_SIZE 32 64# define PAGE_SIZE 4096 65# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) 66 67 .section .text.evex,"ax",@progbits 68ENTRY_P2ALIGN (STRCHR, 5) 69 /* Broadcast CHAR to YMM0. */ 70 VPBROADCAST %esi, %YMM0 71 movl %edi, %eax 72 andl $(PAGE_SIZE - 1), %eax 73 /* Check if we cross page boundary with one vector load. 74 Otherwise it is safe to use an unaligned load. */ 75 cmpl $(PAGE_SIZE - VEC_SIZE), %eax 76 ja L(cross_page_boundary) 77 78 /* Check the first VEC_SIZE bytes. Search for both CHAR and the 79 null bytes. */ 80 VMOVU (%rdi), %YMM1 81 82 /* Leaves only CHARS matching esi as 0. */ 83 vpxorq %YMM1, %YMM0, %YMM2 84 VPMINU %YMM2, %YMM1, %YMM2 85 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ 86 VPTESTN %YMM2, %YMM2, %k0 87 kmovd %k0, %eax 88 testl %eax, %eax 89 jz L(aligned_more) 90 tzcntl %eax, %eax 91# ifndef USE_AS_STRCHRNUL 92 /* Found CHAR or the null byte. */ 93 cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG 94 /* NB: Use a branch instead of cmovcc here. The expectation is 95 that with strchr the user will branch based on input being 96 null. Since this branch will be 100% predictive of the user 97 branch a branch miss here should save what otherwise would 98 be branch miss in the user code. Otherwise using a branch 1) 99 saves code size and 2) is faster in highly predictable 100 environments. */ 101 jne L(zero) 102# endif 103# ifdef USE_AS_WCSCHR 104 /* NB: Multiply wchar_t count by 4 to get the number of bytes. 105 */ 106 leaq (%rdi, %rax, CHAR_SIZE), %rax 107# else 108 addq %rdi, %rax 109# endif 110 ret 111 112 113 114 .p2align 4,, 10 115L(first_vec_x4): 116# ifndef USE_AS_STRCHRNUL 117 /* Check to see if first match was CHAR (k0) or null (k1). */ 118 kmovd %k0, %eax 119 tzcntl %eax, %eax 120 kmovd %k1, %ecx 121 /* bzhil will not be 0 if first match was null. */ 122 bzhil %eax, %ecx, %ecx 123 jne L(zero) 124# else 125 /* Combine CHAR and null matches. */ 126 kord %k0, %k1, %k0 127 kmovd %k0, %eax 128 tzcntl %eax, %eax 129# endif 130 /* NB: Multiply sizeof char type (1 or 4) to get the number of 131 bytes. */ 132 leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax 133 ret 134 135# ifndef USE_AS_STRCHRNUL 136L(zero): 137 xorl %eax, %eax 138 ret 139# endif 140 141 142 .p2align 4 143L(first_vec_x1): 144 /* Use bsf here to save 1-byte keeping keeping the block in 1x 145 fetch block. eax guranteed non-zero. */ 146 bsfl %eax, %eax 147# ifndef USE_AS_STRCHRNUL 148 /* Found CHAR or the null byte. */ 149 cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG 150 jne L(zero) 151 152# endif 153 /* NB: Multiply sizeof char type (1 or 4) to get the number of 154 bytes. */ 155 leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax 156 ret 157 158 .p2align 4,, 10 159L(first_vec_x2): 160# ifndef USE_AS_STRCHRNUL 161 /* Check to see if first match was CHAR (k0) or null (k1). */ 162 kmovd %k0, %eax 163 tzcntl %eax, %eax 164 kmovd %k1, %ecx 165 /* bzhil will not be 0 if first match was null. */ 166 bzhil %eax, %ecx, %ecx 167 jne L(zero) 168# else 169 /* Combine CHAR and null matches. */ 170 kord %k0, %k1, %k0 171 kmovd %k0, %eax 172 tzcntl %eax, %eax 173# endif 174 /* NB: Multiply sizeof char type (1 or 4) to get the number of 175 bytes. */ 176 leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax 177 ret 178 179 .p2align 4,, 10 180L(first_vec_x3): 181 /* Use bsf here to save 1-byte keeping keeping the block in 1x 182 fetch block. eax guranteed non-zero. */ 183 bsfl %eax, %eax 184# ifndef USE_AS_STRCHRNUL 185 /* Found CHAR or the null byte. */ 186 cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG 187 jne L(zero) 188# endif 189 /* NB: Multiply sizeof char type (1 or 4) to get the number of 190 bytes. */ 191 leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax 192 ret 193 194 .p2align 4 195L(aligned_more): 196 /* Align data to VEC_SIZE. */ 197 andq $-VEC_SIZE, %rdi 198L(cross_page_continue): 199 /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since 200 data is only aligned to VEC_SIZE. Use two alternating methods 201 for checking VEC to balance latency and port contention. */ 202 203 /* This method has higher latency but has better port 204 distribution. */ 205 VMOVA (VEC_SIZE)(%rdi), %YMM1 206 /* Leaves only CHARS matching esi as 0. */ 207 vpxorq %YMM1, %YMM0, %YMM2 208 VPMINU %YMM2, %YMM1, %YMM2 209 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ 210 VPTESTN %YMM2, %YMM2, %k0 211 kmovd %k0, %eax 212 testl %eax, %eax 213 jnz L(first_vec_x1) 214 215 /* This method has higher latency but has better port 216 distribution. */ 217 VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 218 /* Each bit in K0 represents a CHAR in YMM1. */ 219 VPCMP $0, %YMM1, %YMM0, %k0 220 /* Each bit in K1 represents a CHAR in YMM1. */ 221 VPTESTN %YMM1, %YMM1, %k1 222 kortestd %k0, %k1 223 jnz L(first_vec_x2) 224 225 VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 226 /* Leaves only CHARS matching esi as 0. */ 227 vpxorq %YMM1, %YMM0, %YMM2 228 VPMINU %YMM2, %YMM1, %YMM2 229 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ 230 VPTESTN %YMM2, %YMM2, %k0 231 kmovd %k0, %eax 232 testl %eax, %eax 233 jnz L(first_vec_x3) 234 235 VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 236 /* Each bit in K0 represents a CHAR in YMM1. */ 237 VPCMP $0, %YMM1, %YMM0, %k0 238 /* Each bit in K1 represents a CHAR in YMM1. */ 239 VPTESTN %YMM1, %YMM1, %k1 240 kortestd %k0, %k1 241 jnz L(first_vec_x4) 242 243 /* Align data to VEC_SIZE * 4 for the loop. */ 244 addq $VEC_SIZE, %rdi 245 andq $-(VEC_SIZE * 4), %rdi 246 247 .p2align 4 248L(loop_4x_vec): 249 /* Check 4x VEC at a time. No penalty to imm32 offset with evex 250 encoding. */ 251 VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 252 VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 253 VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 254 VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 255 256 /* For YMM1 and YMM3 use xor to set the CHARs matching esi to 257 zero. */ 258 vpxorq %YMM1, %YMM0, %YMM5 259 /* For YMM2 and YMM4 cmp not equals to CHAR and store result in 260 k register. Its possible to save either 1 or 2 instructions 261 using cmp no equals method for either YMM1 or YMM1 and YMM3 262 respectively but bottleneck on p5 makes it not worth it. */ 263 VPCMP $4, %YMM0, %YMM2, %k2 264 vpxorq %YMM3, %YMM0, %YMM7 265 VPCMP $4, %YMM0, %YMM4, %k4 266 267 /* Use min to select all zeros from either xor or end of string). 268 */ 269 VPMINU %YMM1, %YMM5, %YMM1 270 VPMINU %YMM3, %YMM7, %YMM3 271 272 /* Use min + zeromask to select for zeros. Since k2 and k4 will 273 have 0 as positions that matched with CHAR which will set 274 zero in the corresponding destination bytes in YMM2 / YMM4. 275 */ 276 VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} 277 VPMINU %YMM3, %YMM4, %YMM4 278 VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} 279 280 VPTESTN %YMM4, %YMM4, %k1 281 kmovd %k1, %ecx 282 subq $-(VEC_SIZE * 4), %rdi 283 testl %ecx, %ecx 284 jz L(loop_4x_vec) 285 286 VPTESTN %YMM1, %YMM1, %k0 287 kmovd %k0, %eax 288 testl %eax, %eax 289 jnz L(last_vec_x1) 290 291 VPTESTN %YMM2, %YMM2, %k0 292 kmovd %k0, %eax 293 testl %eax, %eax 294 jnz L(last_vec_x2) 295 296 VPTESTN %YMM3, %YMM3, %k0 297 kmovd %k0, %eax 298 /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ 299# ifdef USE_AS_WCSCHR 300 sall $8, %ecx 301 orl %ecx, %eax 302 bsfl %eax, %eax 303# else 304 salq $32, %rcx 305 orq %rcx, %rax 306 bsfq %rax, %rax 307# endif 308# ifndef USE_AS_STRCHRNUL 309 /* Check if match was CHAR or null. */ 310 cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG 311 jne L(zero_end) 312# endif 313 /* NB: Multiply sizeof char type (1 or 4) to get the number of 314 bytes. */ 315 leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax 316 ret 317 318 .p2align 4,, 8 319L(last_vec_x1): 320 bsfl %eax, %eax 321# ifdef USE_AS_WCSCHR 322 /* NB: Multiply wchar_t count by 4 to get the number of bytes. 323 */ 324 leaq (%rdi, %rax, CHAR_SIZE), %rax 325# else 326 addq %rdi, %rax 327# endif 328 329# ifndef USE_AS_STRCHRNUL 330 /* Check if match was null. */ 331 cmp (%rax), %CHAR_REG 332 jne L(zero_end) 333# endif 334 335 ret 336 337 .p2align 4,, 8 338L(last_vec_x2): 339 bsfl %eax, %eax 340# ifndef USE_AS_STRCHRNUL 341 /* Check if match was null. */ 342 cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG 343 jne L(zero_end) 344# endif 345 /* NB: Multiply sizeof char type (1 or 4) to get the number of 346 bytes. */ 347 leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax 348 ret 349 350 /* Cold case for crossing page with first load. */ 351 .p2align 4,, 8 352L(cross_page_boundary): 353 movq %rdi, %rdx 354 /* Align rdi. */ 355 andq $-VEC_SIZE, %rdi 356 VMOVA (%rdi), %YMM1 357 /* Leaves only CHARS matching esi as 0. */ 358 vpxorq %YMM1, %YMM0, %YMM2 359 VPMINU %YMM2, %YMM1, %YMM2 360 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ 361 VPTESTN %YMM2, %YMM2, %k0 362 kmovd %k0, %eax 363 /* Remove the leading bits. */ 364# ifdef USE_AS_WCSCHR 365 movl %edx, %SHIFT_REG 366 /* NB: Divide shift count by 4 since each bit in K1 represent 4 367 bytes. */ 368 sarl $2, %SHIFT_REG 369 andl $(CHAR_PER_VEC - 1), %SHIFT_REG 370# endif 371 sarxl %SHIFT_REG, %eax, %eax 372 /* If eax is zero continue. */ 373 testl %eax, %eax 374 jz L(cross_page_continue) 375 bsfl %eax, %eax 376 377# ifdef USE_AS_WCSCHR 378 /* NB: Multiply wchar_t count by 4 to get the number of 379 bytes. */ 380 leaq (%rdx, %rax, CHAR_SIZE), %rax 381# else 382 addq %rdx, %rax 383# endif 384# ifndef USE_AS_STRCHRNUL 385 /* Check to see if match was CHAR or null. */ 386 cmp (%rax), %CHAR_REG 387 je L(cross_page_ret) 388L(zero_end): 389 xorl %eax, %eax 390L(cross_page_ret): 391# endif 392 ret 393 394END (STRCHR) 395#endif 396