/* memchr optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation,
   so this one must also be built for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# ifndef MEMCHR
#  define MEMCHR	__memchr_sse2
# endif
# ifdef USE_AS_WMEMCHR
#  define PCMPEQ	pcmpeqd
#  define CHAR_PER_VEC	4
# else
#  define PCMPEQ	pcmpeqb
#  define CHAR_PER_VEC	16
# endif

/* Fast SSE2 version using pmaxub and a 64-byte loop.  */

	.text
ENTRY(MEMCHR)
	/* rdi = buffer, esi = character to find, rdx = length in chars.  */
	movd	%esi, %xmm1
	mov	%edi, %ecx

# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
# ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
# else
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
# endif

	and	$63, %ecx
	/* Finish broadcasting the search character across xmm1.  */
	pshufd	$0, %xmm1, %xmm1

	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	add	%rcx, %rdx
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid
	   possible addition overflow.  */
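	/* For example, with rcx = 5 and rdx near SIZE_MAX, "rdx + rcx"
	   would wrap around, while "rdx - (16 - 5)" = "rdx - 11" stays
	   in range and leaves the same remaining length.  */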
	neg	%rcx
	add	$16, %rcx
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	test	$0x3f, %rdi
	jz	L(align64_loop)

	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	add	%rcx, %rdx

	.p2align 4
L(align64_loop):
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

	/* Merge the four compare results with pmaxub: the merged value
	   is non-zero iff any of the 64 bytes matched, so a single
	   pmovmskb suffices to test all of them.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
	add	$(CHAR_PER_VEC * 2), %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$CHAR_PER_VEC, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	add	$(CHAR_PER_VEC * 2), %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(matches0):
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(MEMCHR)
#endif