/* strrchr optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* ISA level >= 2 because there are no {wcs|str}rchr-sse4
   implementations.  */
#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR __strrchr_sse2
# endif

# ifdef USE_AS_WCSRCHR
#  define PCMPEQ pcmpeqd
#  define CHAR_SIZE 4
#  define PMINU pminud
# else
#  define PCMPEQ pcmpeqb
#  define CHAR_SIZE 1
#  define PMINU pminub
# endif

# define PAGE_SIZE 4096
# define VEC_SIZE 16

        .text
ENTRY(STRRCHR)
        movd %esi, %xmm0
        movq %rdi, %rax
        andl $(PAGE_SIZE - 1), %eax
# ifndef USE_AS_WCSRCHR
        punpcklbw %xmm0, %xmm0
        punpcklwd %xmm0, %xmm0
# endif
        pshufd $0, %xmm0, %xmm0
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
        ja L(cross_page)

L(cross_page_continue):
        movups (%rdi), %xmm1
        pxor %xmm2, %xmm2
        PCMPEQ %xmm1, %xmm2
        pmovmskb %xmm2, %ecx
        testl %ecx, %ecx
        jz L(aligned_more)

        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        leal -1(%rcx), %edx
        xorl %edx, %ecx
        andl %ecx, %eax
        jz L(ret0)
        bsrl %eax, %eax
        addq %rdi, %rax
        /* We are off by 3 for wcsrchr if search CHAR is non-zero.  If
           search CHAR is zero we are correct.  Either way `andq
           -CHAR_SIZE, %rax` gets the correct result.  */
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
L(ret0):
        ret

        /* Returns for first vec x1/x2 have hard coded backward search
           path for earlier matches.  */
        .p2align 4
L(first_vec_x0_test):
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        testl %eax, %eax
        jz L(ret0)
        bsrl %eax, %eax
        addq %r8, %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        ret

        .p2align 4
L(first_vec_x1):
        PCMPEQ %xmm0, %xmm2
        pmovmskb %xmm2, %eax
        leal -1(%rcx), %edx
        xorl %edx, %ecx
        andl %ecx, %eax
        jz L(first_vec_x0_test)
        bsrl %eax, %eax
        leaq (VEC_SIZE)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        ret

        .p2align 4
L(first_vec_x1_test):
        PCMPEQ %xmm0, %xmm2
        pmovmskb %xmm2, %eax
        testl %eax, %eax
        jz L(first_vec_x0_test)
        bsrl %eax, %eax
        leaq (VEC_SIZE)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        ret

        .p2align 4
L(first_vec_x2):
        PCMPEQ %xmm0, %xmm3
        pmovmskb %xmm3, %eax
        leal -1(%rcx), %edx
        xorl %edx, %ecx
        andl %ecx, %eax
        jz L(first_vec_x1_test)
        bsrl %eax, %eax
        leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        ret

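        /* The return paths above (and the new-match paths below) all use
           the same bit trick to ignore matches falling after the first
           null terminator: with the null mask in %ecx, `ecx ^ (ecx - 1)`
           sets every bit up to and including its lowest set bit, so
           ANDing that with the CHAR-match mask keeps only matches at or
           before the first null, and `bsr` then picks the last of them.
           As a rough, illustrative C sketch (not code that is assembled
           here; the names are invented for explanation only):

               valid = char_mask & (null_mask ^ (null_mask - 1));
               if (valid != 0)
                 last = base + 31 - __builtin_clz (valid);

           where the final line corresponds to the `bsrl`/`addq` pair.  */
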
        .p2align 4
L(aligned_more):
        /* Save original pointer if match was in VEC 0.  */
        movq %rdi, %r8
        andq $-VEC_SIZE, %rdi

        movaps VEC_SIZE(%rdi), %xmm2
        pxor %xmm3, %xmm3
        PCMPEQ %xmm2, %xmm3
        pmovmskb %xmm3, %ecx
        testl %ecx, %ecx
        jnz L(first_vec_x1)

        movaps (VEC_SIZE * 2)(%rdi), %xmm3
        pxor %xmm4, %xmm4
        PCMPEQ %xmm3, %xmm4
        pmovmskb %xmm4, %ecx
        testl %ecx, %ecx
        jnz L(first_vec_x2)

        addq $VEC_SIZE, %rdi
        /* Save pointer again before realigning.  */
        movq %rdi, %rsi
        andq $-(VEC_SIZE * 2), %rdi
        .p2align 4
L(first_loop):
        /* Do 2x VEC at a time.  */
        movaps (VEC_SIZE * 2)(%rdi), %xmm4
        movaps (VEC_SIZE * 3)(%rdi), %xmm5
        /* SSE2 has no pminud, so wcsrchr needs separate logic for
           detecting zero.  Note if this is found to be a bottleneck it
           may be worth adding an SSE4.1 wcsrchr implementation.  */
# ifdef USE_AS_WCSRCHR
        movaps %xmm5, %xmm6
        pxor %xmm8, %xmm8

        PCMPEQ %xmm8, %xmm5
        PCMPEQ %xmm4, %xmm8
        por %xmm5, %xmm8
# else
        movaps %xmm5, %xmm6
        PMINU %xmm4, %xmm5
# endif

        movaps %xmm4, %xmm9
        PCMPEQ %xmm0, %xmm4
        PCMPEQ %xmm0, %xmm6
        movaps %xmm6, %xmm7
        por %xmm4, %xmm6
# ifndef USE_AS_WCSRCHR
        pxor %xmm8, %xmm8
        PCMPEQ %xmm5, %xmm8
# endif
        pmovmskb %xmm8, %ecx
        pmovmskb %xmm6, %eax

        addq $(VEC_SIZE * 2), %rdi
        /* Use `addl` 1) so we can undo it with `subl` and 2) it can
           macro-fuse with `jz`.  */
        addl %ecx, %eax
        jz L(first_loop)

        /* Check if there is a zero match.  */
        testl %ecx, %ecx
        jz L(second_loop_match)

        /* Check if there was a match in the last iteration.  */
        subl %ecx, %eax
        jnz L(new_match)

L(first_loop_old_match):
        PCMPEQ %xmm0, %xmm2
        PCMPEQ %xmm0, %xmm3
        pmovmskb %xmm2, %ecx
        pmovmskb %xmm3, %eax
        addl %eax, %ecx
        jz L(first_vec_x0_test)
        /* NB: We could move this shift to before the branch and save a
           bit of code size / performance on the fall through.  The
           branch leads to the null case, which generally seems hotter
           than finding CHAR in the first 3x VEC.  */
        sall $16, %eax
        orl %ecx, %eax

        bsrl %eax, %eax
        addq %rsi, %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        ret

        .p2align 4
L(new_match):
        pxor %xmm6, %xmm6
        PCMPEQ %xmm9, %xmm6
        pmovmskb %xmm6, %eax
        sall $16, %ecx
        orl %eax, %ecx

        /* We can't reuse either of the old comparisons because we mask
           off zeros after the first zero (instead of using the full
           comparison), so we can't guarantee no interference between a
           match after the end of the string and a valid match.  */
        pmovmskb %xmm4, %eax
        pmovmskb %xmm7, %edx
        sall $16, %edx
        orl %edx, %eax

        leal -1(%ecx), %edx
        xorl %edx, %ecx
        andl %ecx, %eax
        jz L(first_loop_old_match)
        bsrl %eax, %eax
        addq %rdi, %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        ret

        /* Save minimum state for getting most recent match.  We can
           throw out all previous work.  */
        .p2align 4
L(second_loop_match):
        movq %rdi, %rsi
        movaps %xmm4, %xmm2
        movaps %xmm7, %xmm3

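        /* Both L(first_loop) above and L(second_loop) below detect a null
           terminator in either of the two vectors with a single compare:
           for strrchr the unsigned byte minimum (PMINU = pminub) of the
           two vectors has a zero byte iff either vector does.  SSE2 has
           no packed dword minimum, so the wcsrchr build instead compares
           each vector against zero and ORs the results.  An illustrative
           sketch with SSE2 intrinsics (for explanation only, not code
           that is built here):

               zero = _mm_setzero_si128 ();
               strrchr:  null = _mm_cmpeq_epi8 (_mm_min_epu8 (v0, v1), zero);
               wcsrchr:  null = _mm_or_si128 (_mm_cmpeq_epi32 (v0, zero),
                                              _mm_cmpeq_epi32 (v1, zero));  */
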
        .p2align 4
L(second_loop):
        movaps (VEC_SIZE * 2)(%rdi), %xmm4
        movaps (VEC_SIZE * 3)(%rdi), %xmm5
        /* SSE2 has no pminud, so wcsrchr needs separate logic for
           detecting zero.  Note if this is found to be a bottleneck it
           may be worth adding an SSE4.1 wcsrchr implementation.  */
# ifdef USE_AS_WCSRCHR
        movaps %xmm5, %xmm6
        pxor %xmm8, %xmm8

        PCMPEQ %xmm8, %xmm5
        PCMPEQ %xmm4, %xmm8
        por %xmm5, %xmm8
# else
        movaps %xmm5, %xmm6
        PMINU %xmm4, %xmm5
# endif

        movaps %xmm4, %xmm9
        PCMPEQ %xmm0, %xmm4
        PCMPEQ %xmm0, %xmm6
        movaps %xmm6, %xmm7
        por %xmm4, %xmm6
# ifndef USE_AS_WCSRCHR
        pxor %xmm8, %xmm8
        PCMPEQ %xmm5, %xmm8
# endif

        pmovmskb %xmm8, %ecx
        pmovmskb %xmm6, %eax

        addq $(VEC_SIZE * 2), %rdi
        /* Either null term or new occurrence of CHAR.  */
        addl %ecx, %eax
        jz L(second_loop)

        /* No null term, so it must be a new occurrence of CHAR.  */
        testl %ecx, %ecx
        jz L(second_loop_match)

        subl %ecx, %eax
        jnz L(second_loop_new_match)

L(second_loop_old_match):
        pmovmskb %xmm2, %ecx
        pmovmskb %xmm3, %eax
        sall $16, %eax
        orl %ecx, %eax
        bsrl %eax, %eax
        addq %rsi, %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        ret

        .p2align 4
L(second_loop_new_match):
        pxor %xmm6, %xmm6
        PCMPEQ %xmm9, %xmm6
        pmovmskb %xmm6, %eax
        sall $16, %ecx
        orl %eax, %ecx

        /* We can't reuse either of the old comparisons because we mask
           off zeros after the first zero (instead of using the full
           comparison), so we can't guarantee no interference between a
           match after the end of the string and a valid match.  */
        pmovmskb %xmm4, %eax
        pmovmskb %xmm7, %edx
        sall $16, %edx
        orl %edx, %eax

        leal -1(%ecx), %edx
        xorl %edx, %ecx
        andl %ecx, %eax
        jz L(second_loop_old_match)
        bsrl %eax, %eax
        addq %rdi, %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
        ret

        .p2align 4,, 4
L(cross_page):
        movq %rdi, %rsi
        andq $-VEC_SIZE, %rsi
        movaps (%rsi), %xmm1
        pxor %xmm2, %xmm2
        PCMPEQ %xmm1, %xmm2
        pmovmskb %xmm2, %edx
        movl %edi, %ecx
        andl $(VEC_SIZE - 1), %ecx
        sarl %cl, %edx
        jz L(cross_page_continue)
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        sarl %cl, %eax
        leal -1(%rdx), %ecx
        xorl %edx, %ecx
        andl %ecx, %eax
        jz L(ret1)
        bsrl %eax, %eax
        addq %rdi, %rax
# ifdef USE_AS_WCSRCHR
        andq $-CHAR_SIZE, %rax
# endif
L(ret1):
        ret
END(STRRCHR)
#endif
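
/* Note on L(cross_page): when the first unaligned VEC_SIZE load could
   cross a page boundary, the pointer is instead rounded down to VEC_SIZE,
   one aligned load is done, and both pmovmskb masks are shifted right by
   the misalignment so that bit 0 lines up with the first byte of the
   string.  Roughly, as an illustrative sketch (names invented for
   explanation only, not code that is built here):

       offset    = (uintptr_t) s & (VEC_SIZE - 1);
       null_mask = movemask (aligned_vec == 0)    >> offset;
       char_mask = movemask (aligned_vec == CHAR) >> offset;

   If the shifted null mask is zero, the code jumps back to
   L(cross_page_continue) and continues as in the no-crossing case.  */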