/* Optimized memchr with sse2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* void *memchr (const void *s, int c, size_t n)  -- default build
   void *rawmemchr (const void *s, int c)         -- USE_AS_RAWMEMCHR build

   i386 (32-bit) implementation: arguments are read from the stack
   (see STR1/STR2/LEN below); SSE2 compares 16 bytes at a time and
   `bsf' locates the first matching byte inside a mask.

   Register roles once the main scan is set up:
     %xmm1 - the search byte C broadcast into all 16 lanes
     memchr build:    %edi = current 16-byte-aligned scan pointer
                      %edx = bytes remaining
     rawmemchr build: %edx = current 16-byte-aligned scan pointer
                      (no length bookkeeping at all)
   %edi is callee-saved, so the memchr paths that use it bracket it
   with PUSH/POP plus matching CFI notes.  */

#if IS_IN (libc)

# include <sysdep.h>

/* CFI bookkeeping paired with the PUSH/POP macros below so unwind
   info stays accurate around the saved %edi.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

/* Stack offsets of the incoming arguments (return address occupies
   the first 4 bytes at %esp on entry).  */
# define PARMS  4
# define STR1  PARMS		/* const void *s  */
# define STR2  STR1+4		/* int c          */

# ifndef USE_AS_RAWMEMCHR
#  define LEN  STR2+4		/* size_t n (memchr only)  */
/* Common exit for paths that hold a pushed %edi: restore it, return,
   then re-assert the push state for any code that falls after.  */
#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
# endif

# ifndef MEMCHR
#  define MEMCHR __memchr_sse2_bsf
# endif

	.text
ENTRY (MEMCHR)

	mov	STR1(%esp), %ecx	/* ecx = s  */
	movd	STR2(%esp), %xmm1	/* xmm1[0] = c  */

# ifndef USE_AS_RAWMEMCHR
	mov	LEN(%esp), %edx		/* edx = n  */
	test	%edx, %edx
	jz	L(return_null_1)	/* n == 0 -> NULL  */
# endif
	mov	%ecx, %eax		/* eax = s (working pointer)  */

/* Broadcast the low byte of xmm1 to all 16 lanes:
   two punpcklbw double the byte to 2 then 4 copies in the low dword,
   then pshufd $0 replicates that dword across the register.  */
	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1

	and	$63, %ecx		/* ecx = s mod 64  */
	pshufd	$0, %xmm1, %xmm1

/* If s is within the last 16 bytes of a 64-byte block, an unaligned
   16-byte load could cross into an unmapped/uncached line; take the
   aligned-load path instead.  */
	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%eax), %xmm0		/* first 16 bytes, unaligned  */
	pcmpeqb	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	je	L(unaligned_no_match_1)
/* Check which byte is a match.  */
	bsf	%ecx, %ecx

# ifndef USE_AS_RAWMEMCHR
	sub	%ecx, %edx
	jbe	L(return_null_1)	/* match at or past s + n  */
# endif
	add	%ecx, %eax		/* eax = address of match  */
	ret

	.p2align 4
L(unaligned_no_match_1):
# ifndef USE_AS_RAWMEMCHR
	sub	$16, %edx
	jbe	L(return_null_1)	/* n <= 16: whole range checked  */
	PUSH	(%edi)
/* edi = next 16-byte-aligned address past the bytes just scanned;
   fold the skipped misalignment (s mod 16) back into the remaining
   count so edx keeps measuring bytes from edi.  */
	lea	16(%eax), %edi
	and	$15, %eax
	and	$-16, %edi
	add	%eax, %edx
# else
	lea	16(%eax), %edx		/* edx = aligned scan pointer  */
	and	$-16, %edx
# endif
	jmp	L(loop_prolog)

	.p2align 4
L(return_null_1):
	xor	%eax, %eax
	ret

# ifndef USE_AS_RAWMEMCHR
	CFI_POP	(%edi)
# endif

	.p2align 4
L(crosscache):
/* Handle unaligned string.  */

# ifndef USE_AS_RAWMEMCHR
	PUSH	(%edi)
	mov	%eax, %edi
	and	$15, %ecx		/* ecx = misalignment within 16  */
	and	$-16, %edi		/* edi = s rounded down to 16  */
	movdqa	(%edi), %xmm0		/* aligned load covering s  */
# else
	mov	%eax, %edx
	and	$15, %ecx
	and	$-16, %edx
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes: shift out mask bits for bytes before s.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
/* Check which byte is a match.  */
	bsf	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	sub	%eax, %edx
	jbe	L(return_null)		/* match would be at/past s + n  */
	add	%edi, %eax		/* aligned base + offset ...  */
	add	%ecx, %eax		/* ... + misalignment = match  */
	RETURN
# else
	add	%edx, %eax
	add	%ecx, %eax
	ret
# endif

	.p2align 4
L(unaligned_no_match):
# ifndef USE_AS_RAWMEMCHR
	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturated math:
	   edx = ecx + edx
	   edx |= -(edx < ecx)
	   (sbb materializes -1 exactly when the add carried, clamping
	   edx to 0xffffffff instead of wrapping).  */
	add	%ecx, %edx
	sbb	%eax, %eax
	or	%eax, %edx
	sub	$16, %edx
	jbe	L(return_null)
	add	$16, %edi		/* advance past the scanned block  */
# else
	add	$16, %edx
# endif

	.p2align 4
/* Loop start on aligned string.  Two unrolled 64-byte passes follow;
   the second ends by rounding the pointer down to 64 so the main
   L(align64_loop) runs fully cache-line aligned.  */
L(loop_prolog):
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)		/* fewer than 64 bytes left  */
	movdqa	(%edi), %xmm0
# else
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

# ifndef USE_AS_RAWMEMCHR
	movdqa	16(%edi), %xmm2
# else
	movdqa	16(%edx), %xmm2
# endif
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

# ifndef USE_AS_RAWMEMCHR
	movdqa	48(%edi), %xmm4
# else
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm4

/* Pointer is advanced before testing the 4th mask, so a hit here
   reports through L(matches0), which compensates with -16.  */
# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

# ifndef USE_AS_RAWMEMCHR
	test	$0x3f, %edi
# else
	test	$0x3f, %edx
# endif
	jz	L(align64_loop)		/* already 64-byte aligned  */

# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
# else
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

# ifndef USE_AS_RAWMEMCHR
	movdqa	16(%edi), %xmm2
# else
	movdqa	16(%edx), %xmm2
# endif
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

# ifndef USE_AS_RAWMEMCHR
	movdqa	48(%edi), %xmm3
# else
	movdqa	48(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif
	test	%eax, %eax
	jnz	L(matches0)

/* Round the pointer down to a 64-byte boundary for the main loop;
   in the memchr build the bytes stepped back over are credited back
   to the remaining count.  */
# ifndef USE_AS_RAWMEMCHR
	mov	%edi, %ecx
	and	$-64, %edi
	and	$63, %ecx
	add	%ecx, %edx
# else
	and	$-64, %edx
# endif

	.p2align 4
L(align64_loop):
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
	movdqa	16(%edi), %xmm2
	movdqa	32(%edi), %xmm3
	movdqa	48(%edi), %xmm4
# else
	movdqa	(%edx), %xmm0
	movdqa	16(%edx), %xmm2
	movdqa	32(%edx), %xmm3
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

/* Each compare result is 0x00 or 0xff per byte, so pmaxub ORs the
   four masks together: one pmovmskb then tests all 64 bytes.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif

	test	%eax, %eax
	jz	L(align64_loop)

/* A match is somewhere in the previous 64 bytes; step back and
   re-test each 16-byte chunk to locate it.  */
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edi
# else
	sub	$64, %edx
# endif

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

/* xmm3/xmm4 were clobbered by the pmaxub reduction; reload and
   recompare the last two chunks (xmm1 is expendable here since a
   match is guaranteed in one of them).  */
# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif

	pcmpeqb	%xmm1, %xmm3

# ifndef USE_AS_RAWMEMCHR
	pcmpeqb	48(%edi), %xmm1
# else
	pcmpeqb	48(%edx), %xmm1
# endif
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	lea	48(%edi, %eax), %eax
	RETURN
# else
	lea	48(%edx, %eax), %eax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
/* Tail: under 64 bytes remain.  edx went non-positive on the
   `sub $64' above; restore it to the true remaining count, then
   check only as many 16-byte chunks as it covers, bounds-checking
   each hit via the *_1 match exits.  */
	.p2align 4
L(exit_loop):
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%edi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%edi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	48(%edi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(exit_loop_32):
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	16(%edi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	RETURN
# endif

/* Match exits for the main loop: eax holds the pcmpeqb bit mask;
   the pointer already points at (or, for matches0, 16 bytes past)
   the chunk that matched.  No length check needed -- the loop
   guaranteed at least 64 bytes remained.  */
	.p2align 4
L(matches0):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	-16(%eax, %edi), %eax
	RETURN
# else
	lea	-16(%eax, %edx), %eax
	ret
# endif

	.p2align 4
L(matches):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	add	%edi, %eax
	RETURN
# else
	add	%edx, %eax
	ret
# endif

	.p2align 4
L(matches16):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	16(%eax, %edi), %eax
	RETURN
# else
	lea	16(%eax, %edx), %eax
	ret
# endif

	.p2align 4
L(matches32):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	32(%eax, %edi), %eax
	RETURN
# else
	lea	32(%eax, %edx), %eax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
/* Bounds-checked match exits for the tail: the match offset within
   the remaining edx bytes must be strictly inside the buffer, else
   return NULL.  */
	.p2align 4
L(matches_1):
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	add	%edi, %eax
	RETURN

	.p2align 4
L(matches16_1):
	sub	$16, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	16(%edi, %eax), %eax
	RETURN

	.p2align 4
L(matches32_1):
	sub	$32, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	32(%edi, %eax), %eax
	RETURN

	.p2align 4
L(matches48_1):
	sub	$48, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	48(%edi, %eax), %eax
	RETURN
# endif
	.p2align 4
L(return_null):
	xor	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	RETURN
# else
	ret
# endif

END (MEMCHR)
#endif