/* rawmemchr optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include <isa-level.h>
#include <sysdep.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds. */
#if ISA_SHOULD_BUILD (2)

# ifndef RAWMEMCHR
#  define RAWMEMCHR	__rawmemchr_sse2
# endif

/* RAWMEMCHR: search forward for a byte, with no length limit.

   Syntax: GNU as, AT&T operand order (source, destination).

   In:   %rdi = address where the search starts
	 %rsi = byte to look for (only the low 8 bits take part in
		the comparison)
   Out:  %rax = address of the first byte at or after the start
		address that equals the low byte of %rsi
   Clobbers: %rcx, %rdi, %xmm0-%xmm4, flags.

   The only way out of the main loop is a match, so the scan keeps
   going until the byte is found.

   Phases:
     1. Broadcast the search byte to all 16 lanes of %xmm1.
     2. Check the first (possibly unaligned) 16 bytes.  If the start
	offset inside its 64-byte block is above 48, an unaligned
	16-byte load would run past the end of that block, so an
	aligned load is used instead and the bytes in front of the
	start address are shifted out of the mask.
     3. L(loop_prolog): check 16-byte aligned chunks one at a time
	(one or two rounds of four) until %rdi can be put on a
	64-byte boundary.
     4. L(align64_loop): check 64 aligned bytes per iteration, then
	work out which 16-byte chunk held the match.  */

	.text
ENTRY (RAWMEMCHR)
	movd	%rsi, %xmm1
	mov	%rdi, %rcx

	/* Two byte-interleaves fill the low dword with copies of byte 0;
	   pshufd $0 then copies that dword into all four dword lanes.  */
	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1

	/* %rcx = offset of the start address inside its 64-byte block.  */
	and	$63, %rcx
	pshufd	$0, %xmm1, %xmm1

	/* Offset > 48: 16 bytes from here would extend past the block.  */
	cmp	$48, %rcx
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches)
	/* No match: advance to the next 16-byte boundary.  */
	add	$16, %rdi
	and	$-16, %rdi
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* %rcx = misalignment within 16 bytes; %rdi rounded down so the
	   load below is aligned.  */
	and	$15, %rcx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes, i.e. those before the start address.
	   The mask only occupies the low 16 bits, so the arithmetic shift
	   brings in zeros.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  The bit index is relative to the
	   start address, which is aligned %rdi + %rcx.  */
	bsf	%eax, %eax

	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	add	$16, %rdi

	/* From here on %rdi is 16-byte aligned.  Each chunk is tested
	   before the next one is loaded.  */
	.p2align 4
L(loop_prolog):
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	/* %rdi is advanced before the test; L(matches0) compensates.  */
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Already on a 64-byte boundary?  Then enter the main loop.  */
	test	$0x3f, %rdi
	jz	L(align64_loop)

	/* Otherwise check another four chunks the same way.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	/* Round down to a 64-byte boundary.  This lands inside the 64
	   bytes just checked, so some of them are scanned again.  */
	and	$-64, %rdi

	.p2align 4
L(align64_loop):
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Every compare result byte is 0x00 or 0xff, so the unsigned
	   maximum acts as a bytewise OR.  %xmm4 ends up nonzero iff any
	   of the four chunks matched.  %xmm3 and %xmm4 are overwritten;
	   %xmm0 and %xmm2 still hold their own compare results.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	/* A match lies in the 64 bytes just examined; point back at them
	   and find the first chunk that contains it.  */
	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* The compare result for chunk 32 was merged away above, so
	   compute it again.  */
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3

	/* Same for chunk 48.  This overwrites the broadcast byte in
	   %xmm1, which is not read as a pattern again on any path
	   from here.  */
	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* The first three chunks had no match, so it is in the last.  */
	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	/* Match in the chunk at offset 48 of a block whose base is
	   %rdi - 64, hence the -16 displacement.  */
	.p2align 4
L(matches0):
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	/* Match in the chunk at (%rdi).  */
	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	/* Match in the chunk at 16(%rdi).  */
	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	/* Match in the chunk at 32(%rdi).  */
	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

END (RAWMEMCHR)
#endif