/* strchr/strchrnul optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR	__strchr_avx2
# endif

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_REG	esi
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_REG	sil
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

	.section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (STRCHR, 5)
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	VPBROADCAST	%xmm0, %ymm0
	vpxor	%xmm1, %xmm1, %xmm1

	/* Check if we cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
	   null byte.  */
	vmovdqu	(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb	%ymm3, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	/* NB: Use a branch instead of cmovcc here. The expectation is
	   that with strchr the user will branch based on input being
	   null. Since this branch will be 100% predictive of the user
	   branch a branch miss here should save what otherwise would
	   be branch miss in the user code. Otherwise using a branch 1)
	   saves code size and 2) is faster in highly predictable
	   environments.  */
	jne	L(zero)
# endif
	addq	%rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4
L(first_vec_x1):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	incq	%rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x2):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(first_vec_x3):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x4):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN



	.p2align 4
L(aligned_more):
	/* Align data to VEC_SIZE - 1. This is the same number of
	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
	   on x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vmovdqa	1(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb	%ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb	%ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb	%ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb	%ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)
	/* Align data to VEC_SIZE * 4 - 1.  */
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
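	/* The 4x loop below merges the CHAR test and the null test into
	   a single compare per iteration: (x ^ CHAR) is zero exactly
	   where x equals CHAR, so the unsigned minimum of (x ^ CHAR)
	   and x is zero exactly where x is CHAR or the null terminator.
	   One VPCMPEQ against the zero vector then detects both cases
	   at once.  */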
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	1(%rdi), %ymm6
	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm7

	/* Leaves only CHARS matching esi as 0.  */
	vpxor	%ymm6, %ymm0, %ymm2
	vpxor	%ymm7, %ymm0, %ymm3

	VPMINU	%ymm2, %ymm6, %ymm2
	VPMINU	%ymm3, %ymm7, %ymm3

	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm6
	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm7

	vpxor	%ymm6, %ymm0, %ymm4
	vpxor	%ymm7, %ymm0, %ymm5

	VPMINU	%ymm4, %ymm6, %ymm4
	VPMINU	%ymm5, %ymm7, %ymm5

	VPMINU	%ymm2, %ymm3, %ymm6
	VPMINU	%ymm4, %ymm5, %ymm7

	VPMINU	%ymm6, %ymm7, %ymm7

	VPCMPEQ	%ymm7, %ymm1, %ymm7
	vpmovmskb	%ymm7, %ecx
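	/* NB: subq with the negative immediate -(VEC_SIZE * 4) is used
	   rather than addq $(VEC_SIZE * 4): -128 still fits in a
	   sign-extended 8-bit immediate while +128 does not, so this
	   encoding is shorter.  */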
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)

	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)


	VPCMPEQ	%ymm3, %ymm1, %ymm3
	vpmovmskb	%ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	%ymm4, %ymm1, %ymm4
	vpmovmskb	%ymm4, %eax
	/* rcx has combined result from all 4 VEC. It will only be used
	   if the first 3 other VEC all did not contain a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(last_vec_x0):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$-(VEC_SIZE * 4 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4,, 8
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align rdi to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb	%ymm3, %eax
	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
	   so no need to manually mod edx.  */
	sarxl	%edx, %eax, %eax
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	xorl	%ecx, %ecx
	/* Found CHAR or the null byte.  */
	cmp	(%rdx, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdx, %rax
	VZEROUPPER_RETURN

END (STRCHR)
#endif