/* Optimized memrchr with sse2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

/* Unwind annotations paired with PUSH/POP of a callee-saved register
   (4-byte slots: this is the i386 ABI).  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

/* Stack offsets of the three arguments, relative to %esp on entry
   (return address occupies the first 4 bytes).  */
# define PARMS 4
# define STR1 PARMS
# define STR2 STR1+4
# define LEN STR2+4

# define MEMCHR __memrchr_sse2_bsf

	.text

/* void *__memrchr_sse2_bsf (const void *s, int c, size_t n)

   Scan the n bytes at s BACKWARDS for the last byte equal to
   (unsigned char) c; return a pointer to it, or NULL (0) if absent.

   Strategy: broadcast c into all 16 lanes of %xmm1, then walk the
   buffer from the top down in 16-byte chunks with pcmpeqb/pmovmskb.
   Because we search for the LAST occurrence, each chunk's match mask
   is reduced with `bsr' (highest set bit = highest-addressed match).

   Register roles throughout the main path:
     %ecx  - cursor into the buffer (moves toward lower addresses),
     %edx  - remaining-length counter,
     %xmm1 - the search byte replicated into every lane,
     %eax  - pcmpeqb bit masks / scratch / the return value.  */
ENTRY (MEMCHR)
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1
	mov	LEN(%esp), %edx

/* Short inputs (n <= 16) take a dedicated masked path.  %edx is left
   holding n - 16 for that path to undo.  */
	sub	$16, %edx
	jbe	L(length_less16)

/* Broadcast the byte: two punpcklbw + pshufd replicate lane 0 into all
   16 lanes.  Interleaved with the pointer bump to the last 16-byte
   chunk (%ecx = s + n - 16).  */
	punpcklbw %xmm1, %xmm1
	add	%edx, %ecx
	punpcklbw %xmm1, %xmm1

/* Unaligned check of the topmost 16 bytes first.  */
	movdqu	(%ecx), %xmm0
	pshufd	$0, %xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0

/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches0)

/* No match at the top.  Drop %ecx by 64 (the prolog below examines the
   four chunks at %ecx .. %ecx+48), then round it to a 16-byte boundary
   so the movdqa loads below are legal; %edx absorbs the adjustment so
   it keeps tracking the bytes still unscanned.  */
	sub	$64, %ecx
	mov	%ecx, %eax
	and	$15, %eax
	jz	L(loop_prolog)

	add	$16, %ecx
	add	$16, %edx
	sub	%eax, %ecx
	sub	%eax, %edx

	.p2align 4
/* Loop start on aligned string.  Examines 64 bytes per pass, the four
   16-byte chunks in DESCENDING address order (offset 48 first) so the
   first hit found is the highest-addressed one.  */
L(loop_prolog):
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

/* Second 64-byte pass of the prolog, identical to the first.  */
	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches0)

/* Round the cursor down to a 64-byte boundary for the unrolled main
   loop; again compensate %edx by the same amount.  */
	mov	%ecx, %eax
	and	$63, %eax
	test	%eax, %eax
	jz	L(align64_loop)

	add	$64, %ecx
	add	$64, %edx
	sub	%eax, %ecx
	sub	%eax, %edx

	.p2align 4
/* Main loop: 64 bytes per iteration from a 64-byte aligned cursor.
   The four pcmpeqb results (each byte 0x00 or 0xFF) are merged with
   pmaxub, so ONE pmovmskb/test detects a match anywhere in the 64
   bytes; only then do we re-derive which chunk it was in.  */
L(align64_loop):
	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	(%ecx), %xmm0
	movdqa	16(%ecx), %xmm2
	movdqa	32(%ecx), %xmm3
	movdqa	48(%ecx), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	pmaxub	%xmm3, %xmm0
	pmaxub	%xmm4, %xmm2
	pmaxub	%xmm0, %xmm2
	pmovmskb %xmm2, %eax

	test	%eax, %eax
	jz	L(align64_loop)

/* A match exists somewhere in these 64 bytes: test chunks from the
   highest address down.  xmm4/xmm3 still hold their compare results;
   the two low chunks are recomputed (xmm0/xmm2 were clobbered by the
   pmaxub reduction).  */
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches48)

	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm2

	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	(%ecx), %xmm1

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

/* Must be in the lowest chunk: bsr picks its highest-addressed hit.  */
	pmovmskb %xmm1, %eax
	bsr	%eax, %eax

	add	%ecx, %eax
	ret

	.p2align 4
/* Fewer than 64 unscanned bytes remain.  %edx went non-positive by the
   `sub $64' above; adding 64 back yields the true residual count, which
   selects how many of the four chunks may legally be examined.  The
   lowest chunk(s) can extend below s, hence the matches*_1 exits that
   re-check the match position against the residual length.  */
L(exit_loop):
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches0_1)
	xor	%eax, %eax
	ret

	.p2align 4
/* Residual <= 32 bytes: only the two high chunks can be relevant.  */
L(exit_loop_32):
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	32(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	xor	%eax, %eax
	ret

/* matchesN: a hit in the chunk at %ecx + N, known to be fully inside
   the buffer.  bsr %eax gives the highest set mask bit, i.e. the
   offset of the LAST matching byte within the chunk.  */
	.p2align 4
L(matches0):
	bsr	%eax, %eax
	add	%ecx, %eax
	ret

	.p2align 4
L(matches16):
	bsr	%eax, %eax
	lea	16(%eax, %ecx), %eax
	ret

	.p2align 4
L(matches32):
	bsr	%eax, %eax
	lea	32(%eax, %ecx), %eax
	ret

	.p2align 4
L(matches48):
	bsr	%eax, %eax
	lea	48(%eax, %ecx), %eax
	ret

/* matchesN_1: as matchesN, but the chunk may start below s.  The hit
   is valid only if chunk_base_offset + bit index lies within the
   residual length in %edx; otherwise the match is out of bounds and
   the answer is NULL.  */
	.p2align 4
L(matches0_1):
	bsr	%eax, %eax
	sub	$64, %edx
	add	%eax, %edx
	jl	L(return_null)
	add	%ecx, %eax
	ret

	.p2align 4
L(matches16_1):
	bsr	%eax, %eax
	sub	$48, %edx
	add	%eax, %edx
	jl	L(return_null)
	lea	16(%ecx, %eax), %eax
	ret

	.p2align 4
L(matches32_1):
	bsr	%eax, %eax
	sub	$32, %edx
	add	%eax, %edx
	jl	L(return_null)
	lea	32(%ecx, %eax), %eax
	ret

	.p2align 4
L(matches48_1):
	bsr	%eax, %eax
	sub	$16, %edx
	add	%eax, %edx
	jl	L(return_null)
	lea	48(%ecx, %eax), %eax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

/* n <= 16 and s is 16-byte aligned (%eax = s, %dl = n): one aligned
   compare, then keep only the low n mask bits via (1 << n) - 1.  */
	.p2align 4
L(length_less16_offset0):
	mov	%dl, %cl
	pcmpeqb	(%eax), %xmm1

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx
	mov	%edx, %ecx

	pmovmskb %xmm1, %edx

	and	%ecx, %edx
	test	%edx, %edx
	jz	L(return_null)

	bsr	%edx, %ecx
	add	%ecx, %eax
	ret

/* Short path: n <= 16.  On entry %ecx = s, %edx = n - 16, %xmm1 has c
   only in lane 0, so the broadcast is redone here.  */
	.p2align 4
L(length_less16):
	punpcklbw %xmm1, %xmm1
	mov	%ecx, %eax
	punpcklbw %xmm1, %xmm1
	add	$16, %edx
	jz	L(return_null)

	pshufd	$0, %xmm1, %xmm1
	and	$15, %ecx
	jz	L(length_less16_offset0)

/* Unaligned short input.  %cl = misalignment, %dl = n.
   %dh = misalignment + n - 16: positive iff [s, s+n) spills into the
   NEXT aligned 16-byte chunk.  %eax = s rounded down to 16.  */
	PUSH	(%edi)
	mov	%cl, %dh
	add	%dl, %dh
	and	$-16, %eax

	sub	$16, %dh
	ja	L(length_less16_part2)

/* Whole range within one aligned chunk: compare it, shift the mask
   right by the misalignment so bit 0 is s[0], then mask to n bits.  */
	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	sar	%cl, %edi
	add	%ecx, %eax
	mov	%dl, %cl

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %edi
	test	%edi, %edi
	jz	L(ret_null)

	bsr	%edi, %edi
	add	%edi, %eax
	POP	(%edi)
	ret

	/* CFI bookkeeping: code below is reached with %edi still pushed,
	   so re-establish the annotations undone by the POP above.  */
	CFI_PUSH	(%edi)

/* Range straddles two aligned chunks.  Search the HIGH chunk first
   (it holds the later bytes): mask its result to the bytes below
   s + n, i.e. the low (%dh = misalign + n - 16) bits.  */
	.p2align 4
L(length_less16_part2):
	movdqa	16(%eax), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %edi

	mov	%cl, %ch

	mov	%dh, %cl
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %edi

	test	%edi, %edi
	jnz	L(length_less16_part2_return)

/* No hit in the high chunk: check the low chunk, discarding the
   bytes before s by shifting right by the misalignment (saved in
   %ch).  */
	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	mov	%ch, %cl
	sar	%cl, %edi
	test	%edi, %edi
	jz	L(ret_null)

	bsr	%edi, %edi
	add	%edi, %eax
	xor	%ch, %ch		/* clear high byte so %ecx == misalignment only */
	add	%ecx, %eax
	POP	(%edi)
	ret

	CFI_PUSH	(%edi)

	.p2align 4
L(length_less16_part2_return):
	bsr	%edi, %edi
	lea	16(%eax, %edi), %eax
	POP	(%edi)
	ret

	CFI_PUSH	(%edi)

/* NULL return for the paths that pushed %edi.  */
	.p2align 4
L(ret_null):
	xor	%eax, %eax
	POP	(%edi)
	ret

END (MEMCHR)
#endif