/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_avx2
# endif

# ifdef USE_AS_WCSRCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096

	.section SECTION(.text), "ax", @progbits
ENTRY(STRRCHR)
	vmovd	%esi, %xmm7
	movl	%edi, %eax
	/* Broadcast CHAR to YMM7.  */
	VPBROADCAST %xmm7, %ymm7
	vpxor	%xmm0, %xmm0, %xmm0

	/* Check whether the first VEC_SIZE load from src would cross a
	   page boundary.  Shift here instead of `andl` to save code
	   size (saves a fetch block).  */
	sall	$20, %eax
	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
	ja	L(cross_page)

L(page_cross_continue):
	vmovdqu	(%rdi), %ymm1
	/* Check end of string match.  */
	VPCMPEQ	%ymm1, %ymm0, %ymm6
	vpmovmskb	%ymm6, %ecx
	testl	%ecx, %ecx
	jz	L(aligned_more)

	/* Only check match with search CHAR if needed.  */
	VPCMPEQ	%ymm1, %ymm7, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Check if match before first zero.  */
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jz	L(ret0)
	bsrl	%eax, %eax
	addq	%rdi, %rax
	/* We are off by 3 for wcsrchr if search CHAR is non-zero.  If
	   search CHAR is zero we are correct.  Either way `andq
	   -CHAR_SIZE, %rax` gets the correct result.  */
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	/* Returns for first vec x1/x2 have hard-coded backward search
	   path for earlier matches.  */
	.p2align 4,, 10
L(first_vec_x1):
	VPCMPEQ	%ymm2, %ymm7, %ymm6
	vpmovmskb	%ymm6, %eax
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jnz	L(first_vec_x1_return)

	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMPEQ	%ymm1, %ymm7, %ymm6
	vpmovmskb	%ymm6, %eax
	testl	%eax, %eax
	jz	L(ret1)
	bsrl	%eax, %eax
	addq	%r8, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret1):
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x0_x1_test):
	VPCMPEQ	%ymm2, %ymm7, %ymm6
	vpmovmskb	%ymm6, %eax
	/* Check ymm2 for search CHAR match.  If no match then check
	   ymm1 before returning.  */
	testl	%eax, %eax
	jz	L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsrl	%eax, %eax
	leaq	1(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(first_vec_x2):
	VPCMPEQ	%ymm3, %ymm7, %ymm6
	vpmovmskb	%ymm6, %eax
	blsmskl	%ecx, %ecx
	/* If no in-range search CHAR match in ymm3 then need to check
	   ymm1/ymm2 for an earlier match (we delay checking search
	   CHAR matches until needed).  */
	andl	%ecx, %eax
	jz	L(first_vec_x0_x1_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4
L(aligned_more):
	/* Save original pointer if match was in VEC 0.  */
	movq	%rdi, %r8

	/* Align src.  rdi now points at the last byte of the current
	   VEC so the 1(%rdi) loads below are VEC_SIZE aligned.  */
	orq	$(VEC_SIZE - 1), %rdi
	vmovdqu	1(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm6
	vpmovmskb	%ymm6, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x1)

	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
	VPCMPEQ	%ymm3, %ymm0, %ymm6
	vpmovmskb	%ymm6, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x2)

	/* Save pointer again before realigning.  */
	movq	%rdi, %rsi
	addq	$(VEC_SIZE + 1), %rdi
	andq	$-(VEC_SIZE * 2), %rdi
	.p2align 4
L(first_aligned_loop):
	/* Do 2x VEC at a time.  Any more and the cost of finding the
	   match outweighs the loop benefit.  */
	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5

	VPCMPEQ	%ymm4, %ymm7, %ymm6
	VPMIN	%ymm4, %ymm5, %ymm8
	VPCMPEQ	%ymm5, %ymm7, %ymm10
	vpor	%ymm6, %ymm10, %ymm5
	VPCMPEQ	%ymm8, %ymm0, %ymm8
	vpor	%ymm5, %ymm8, %ymm9

	vpmovmskb	%ymm9, %eax
	addq	$(VEC_SIZE * 2), %rdi
	/* No zero or search CHAR.  */
	testl	%eax, %eax
	jz	L(first_aligned_loop)

	/* If no zero CHAR then go to second loop (this allows us to
	   throw away all prior work).  */
	vpmovmskb	%ymm8, %ecx
	testl	%ecx, %ecx
	jz	L(second_aligned_loop_prep)

	/* Search CHAR could be zero so we need to get the true match.  */
	vpmovmskb	%ymm5, %eax
	testl	%eax, %eax
	jnz	L(first_aligned_loop_return)

	.p2align 4,, 4
L(first_vec_x1_or_x2):
	VPCMPEQ	%ymm3, %ymm7, %ymm3
	VPCMPEQ	%ymm2, %ymm7, %ymm2
	vpmovmskb	%ymm3, %eax
	vpmovmskb	%ymm2, %edx
	/* Use add for macro-fusion.  */
	addq	%rax, %rdx
	jz	L(first_vec_x0_test)
	/* NB: We could move this shift to before the branch and save a
	   bit of code size / performance on the fall through.  The
	   branch leads to the null case which generally seems hotter
	   than char in first 3x VEC.  */
	salq	$32, %rax
	addq	%rdx, %rax
	bsrq	%rax, %rax
	leaq	1(%rsi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(first_aligned_loop_return):
	VPCMPEQ	%ymm4, %ymm0, %ymm4
	vpmovmskb	%ymm4, %edx
	salq	$32, %rcx
	orq	%rdx, %rcx

	vpmovmskb	%ymm10, %eax
	vpmovmskb	%ymm6, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	blsmskq	%rcx, %rcx
	andq	%rcx, %rax
	jz	L(first_vec_x1_or_x2)

	bsrq	%rax, %rax
	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	/* Search char cannot be zero.  */
	.p2align 4
L(second_aligned_loop_set_furthest_match):
	/* Save VEC and pointer from most recent match.  */
L(second_aligned_loop_prep):
	movq	%rdi, %rsi
	vmovdqu	%ymm6, %ymm2
	vmovdqu	%ymm10, %ymm3

	.p2align 4
L(second_aligned_loop):
	/* Search 2x VEC at a time.  */
	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5

	VPCMPEQ	%ymm4, %ymm7, %ymm6
	VPMIN	%ymm4, %ymm5, %ymm1
	VPCMPEQ	%ymm5, %ymm7, %ymm10
	vpor	%ymm6, %ymm10, %ymm5
	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpor	%ymm5, %ymm1, %ymm9

	vpmovmskb	%ymm9, %eax
	addq	$(VEC_SIZE * 2), %rdi
	testl	%eax, %eax
	jz	L(second_aligned_loop)
	vpmovmskb	%ymm1, %ecx
	testl	%ecx, %ecx
	jz	L(second_aligned_loop_set_furthest_match)
	vpmovmskb	%ymm5, %eax
	testl	%eax, %eax
	jnz	L(return_new_match)

	/* This is the hot path.  We know CHAR is in bounds and that
	   ymm2/ymm3 hold the latest match.  */
	.p2align 4,, 4
L(return_old_match):
	vpmovmskb	%ymm3, %eax
	vpmovmskb	%ymm2, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	bsrq	%rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr.  */
	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
	VZEROUPPER_RETURN

	/* Last iteration also potentially has a match.  */
	.p2align 4,, 8
L(return_new_match):
	VPCMPEQ	%ymm4, %ymm0, %ymm4
	vpmovmskb	%ymm4, %edx
	salq	$32, %rcx
	orq	%rdx, %rcx

	vpmovmskb	%ymm10, %eax
	vpmovmskb	%ymm6, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	blsmskq	%rcx, %rcx
	andq	%rcx, %rax
	jz	L(return_old_match)
	bsrq	%rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr.  */
	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(cross_page):
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rsi
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm6
	vpmovmskb	%ymm6, %ecx
	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
	shrxl	%edi, %ecx, %ecx
	testl	%ecx, %ecx
	jz	L(page_cross_continue)
	VPCMPEQ	%ymm1, %ymm7, %ymm1
	vpmovmskb	%ymm1, %eax

	/* Shift out search CHAR matches that are before the beginning
	   of src (rdi).  */
	shrxl	%edi, %eax, %eax
	blsmskl	%ecx, %ecx
	/* Check if any search CHAR match in range.  */
	andl	%ecx, %eax
	jz	L(ret2)
	bsrl	%eax, %eax
	addq	%rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret2):
	VZEROUPPER_RETURN
END(STRRCHR)
#endif