/* strstr with unaligned loads
   Copyright (C) 2009-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "../strchr-isa-default-impl.h"

/* char *__strstr_sse2_unaligned (const char *haystack,  -- %rdi
				  const char *needle)    -- %rsi

   SysV AMD64 ABI; result in %rax (pointer to first occurrence, or NULL).

   Strategy: broadcast needle[0] into %xmm1 and needle[1] into %xmm2,
   then scan the haystack with SSE2 compares for positions where both
   bytes match at adjacent offsets (pcmpeqb at offset k and k+1,
   combined with pminub so a 0xFF lane survives only where BOTH match),
   while simultaneously looking for the terminating NUL (pcmpeqb
   against a zeroed register).  Each candidate position found via
   pmovmskb+bsf is then verified byte-by-byte against the rest of the
   needle.  The first 64 haystack bytes are handled with unaligned
   loads (or via an aligned 64-byte block read when within 64 bytes of
   a page end, to avoid crossing into an unmapped page); after that a
   64-byte-aligned main loop takes over.  If verification work
   outpaces scan progress by more than a 512-byte budget (%r11/%r9
   bookkeeping below), control tail-calls __strstr_generic, which
   presumably has better worst-case behavior -- guards against
   quadratic blow-up on pathological needles.  */

ENTRY(__strstr_sse2_unaligned)
	movzbl	(%rsi), %eax		/* eax = needle[0] */
	testb	%al, %al
	je	L(empty)		/* empty needle: return haystack */
	movzbl	1(%rsi), %edx		/* edx = needle[1] */
	testb	%dl, %dl
	je	L(strchr)		/* 1-byte needle: tail-call strchr */
	movd	%eax, %xmm1
	movd	%edx, %xmm2
	movq	%rdi, %rax
	andl	$4095, %eax		/* eax = page offset of haystack */
	punpcklbw %xmm1, %xmm1
	cmpq	$4031, %rax		/* within 64 bytes of page end?  */
	punpcklbw %xmm2, %xmm2
	punpcklwd %xmm1, %xmm1
	punpcklwd %xmm2, %xmm2
	pshufd	$0, %xmm1, %xmm1	/* xmm1 = needle[0] in all 16 lanes */
	pshufd	$0, %xmm2, %xmm2	/* xmm2 = needle[1] in all 16 lanes */
	ja	L(cross_page)		/* unaligned loads below would cross
					   the page boundary */
	/* Scan haystack bytes 0..31.  Match lane i is set iff
	   hay[i] == needle[0] && hay[i+1] == needle[1]; NUL lanes are
	   OR'ed in so bsf finds the earlier of "pair match" / "end".  */
	movdqu	(%rdi), %xmm3
	pxor	%xmm5, %xmm5		/* xmm5 = 0 (NUL detector) */
	movdqu	1(%rdi), %xmm4
	movdqa	%xmm3, %xmm6		/* keep raw bytes 0..15 for NUL test */
	pcmpeqb	%xmm1, %xmm3		/* bytes 0..15 == needle[0]? */
	pcmpeqb	%xmm2, %xmm4		/* bytes 1..16 == needle[1]? */
	movdqu	16(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6		/* NUL in bytes 0..15? */
	pminub	%xmm4, %xmm3		/* pair match in bytes 0..15 */
	movdqa	%xmm3, %xmm4
	movdqu	17(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5		/* NUL in bytes 16..31? */
	pcmpeqb	%xmm2, %xmm3		/* bytes 17..32 == needle[1]? */
	por	%xmm6, %xmm4		/* fold NUL lanes into low mask */
	pcmpeqb	%xmm1, %xmm0		/* bytes 16..31 == needle[0]? */
	pminub	%xmm3, %xmm0		/* pair match in bytes 16..31 */
	por	%xmm5, %xmm0		/* fold NUL lanes into high mask */
	pmovmskb %xmm4, %r8d
	pmovmskb %xmm0, %eax
	salq	$16, %rax
	orq	%rax, %r8		/* r8 = 32-bit candidate bitmap */
	je	L(next_32_bytes)
L(next_pair_index):
	bsf	%r8, %rax		/* lowest candidate offset */
	addq	%rdi, %rax		/* rax -> candidate in haystack */
	cmpb	$0, (%rax)
	je	L(zero1)		/* candidate was the NUL: not found */
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found1)		/* 2-byte needle fully matched */
	cmpb	2(%rax), %dl
	jne	L(next_pair)
	xorl	%edx, %edx
	jmp	L(pair_loop_start)

	.p2align 4
L(strchr):
	/* Single-character needle: delegate to the default strchr.  */
	movzbl	%al, %esi
	jmp	DEFAULT_STRCHR

	.p2align 4
L(pair_loop):
	/* Verify needle[2+rdx..] against hay[rax+2+rdx..].  */
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair)
L(pair_loop_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop)		/* needle exhausted -> full match */
L(found1):
	ret				/* rax already points at the match */
L(zero1):
	xorl	%eax, %eax		/* not found: return NULL */
	ret

	.p2align 4
L(next_pair):
	leaq	-1(%r8), %rax		/* clear lowest set bit of r8 ...  */
	andq	%rax, %r8
	jne	L(next_pair_index)	/* ... and try the next candidate */

	.p2align 4
L(next_32_bytes):
	/* Same pair/NUL scan for haystack bytes 32..63; candidate bits
	   are placed at positions 32..63 of %r8.  */
	movdqu	32(%rdi), %xmm3
	pxor	%xmm5, %xmm5
	movdqu	33(%rdi), %xmm4
	movdqa	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	movdqu	48(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6
	pminub	%xmm4, %xmm3
	movdqa	%xmm3, %xmm4
	movdqu	49(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5
	pcmpeqb	%xmm2, %xmm3
	por	%xmm6, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0
	pmovmskb %xmm4, %eax
	salq	$32, %rax
	pmovmskb %xmm0, %r8d
	salq	$48, %r8
	orq	%rax, %r8
	je	L(loop_header)		/* nothing in bytes 0..63: main loop */
L(next_pair2_index):
	bsfq	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero2)
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found2)
	cmpb	2(%rax), %dl
	jne	L(next_pair2)
	xorl	%edx, %edx
	jmp	L(pair_loop2_start)

	.p2align 4
L(pair_loop2):
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair2)
L(pair_loop2_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop2)
L(found2):
	ret
L(zero2):
	xorl	%eax, %eax
	ret
L(empty):
	mov	%rdi, %rax		/* strstr (s, "") == s */
	ret

	.p2align 4
L(next_pair2):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair2_index)
L(loop_header):
	/* Main-loop setup.  %r9 remembers the scan start, %r11 is the
	   (negated) work budget: verification cost accumulates into
	   %r11 at L(next_pair3) and is compared against bytes scanned
	   so far to detect quadratic behavior.  */
	movq	$-512, %r11
	movq	%rdi, %r9

	pxor	%xmm7, %xmm7		/* xmm7 = 0 (NUL detector) */
	andq	$-64, %rdi		/* align scan pointer down to 64 */

	.p2align 4
L(loop):
	/* Process 64 haystack bytes per iteration.  For each 16-byte
	   chunk: pxor against the broadcast chars yields a ZERO lane
	   where that char matches; por of the two (offset by one byte)
	   yields zero only where BOTH match.  %xmm0 accumulates the
	   bytewise minimum of the raw chunks (zero iff a NUL is
	   present) and of the combined match vectors, so one final
	   pcmpeqb-against-zero tests "pair match or NUL anywhere".
	   Note first-char loads are at offset k-1, so a set lane marks
	   the SECOND character of the pair.  */
	movdqa	64(%rdi), %xmm3
	movdqu	63(%rdi), %xmm6
	movdqa	%xmm3, %xmm0
	pxor	%xmm2, %xmm3
	pxor	%xmm1, %xmm6
	movdqa	80(%rdi), %xmm10
	por	%xmm3, %xmm6		/* chunk 0: zero lane = pair match */
	pminub	%xmm10, %xmm0
	movdqu	79(%rdi), %xmm3
	pxor	%xmm2, %xmm10
	pxor	%xmm1, %xmm3
	movdqa	96(%rdi), %xmm9
	por	%xmm10, %xmm3		/* chunk 1 match vector */
	pminub	%xmm9, %xmm0
	pxor	%xmm2, %xmm9
	movdqa	112(%rdi), %xmm8
	addq	$64, %rdi
	pminub	%xmm6, %xmm3
	movdqu	31(%rdi), %xmm4
	pminub	%xmm8, %xmm0
	pxor	%xmm2, %xmm8
	pxor	%xmm1, %xmm4
	por	%xmm9, %xmm4		/* chunk 2 match vector */
	pminub	%xmm4, %xmm3
	movdqu	47(%rdi), %xmm5
	pxor	%xmm1, %xmm5
	por	%xmm8, %xmm5		/* chunk 3 match vector */
	pminub	%xmm5, %xmm3
	pminub	%xmm3, %xmm0		/* xmm0 zero lane = match or NUL */
	pcmpeqb	%xmm7, %xmm0
	pmovmskb %xmm0, %eax
	testl	%eax, %eax
	je	L(loop)
	/* Something hit in this 64-byte block: rebuild the precise
	   per-byte candidate bitmap in %r8 (match and NUL positions),
	   bit i corresponding to byte %rdi + i.  */
	pminub	(%rdi), %xmm6
	pminub	32(%rdi), %xmm4
	pminub	48(%rdi), %xmm5
	pcmpeqb	%xmm7, %xmm6
	pcmpeqb	%xmm7, %xmm5
	pmovmskb %xmm6, %edx
	movdqa	16(%rdi), %xmm8
	pcmpeqb	%xmm7, %xmm4
	movdqu	15(%rdi), %xmm0
	pmovmskb %xmm5, %r8d
	movdqa	%xmm8, %xmm3
	pmovmskb %xmm4, %ecx
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm3
	salq	$32, %rcx
	pcmpeqb	%xmm7, %xmm8
	salq	$48, %r8
	pminub	%xmm0, %xmm3
	orq	%rcx, %rdx
	por	%xmm3, %xmm8
	orq	%rdx, %r8
	pmovmskb %xmm8, %eax
	salq	$16, %rax
	orq	%rax, %r8
	je	L(loop)			/* all hits were spurious carries */
L(next_pair_index3):
	bsfq	%r8, %rcx
	addq	%rdi, %rcx		/* rcx -> second char of candidate */
	cmpb	$0, (%rcx)
	je	L(zero)			/* hit the terminator: not found */
	xorl	%eax, %eax
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(success3)
	cmpb	1(%rcx), %dl
	jne	L(next_pair3)
	jmp	L(pair_loop_start3)

	.p2align 4
L(pair_loop3):
	/* rax counts verified needle bytes past needle[2].  */
	addq	$1, %rax
	cmpb	1(%rcx,%rax), %dl
	jne	L(next_pair3)
L(pair_loop_start3):
	movzbl	3(%rsi,%rax), %edx
	testb	%dl, %dl
	jne	L(pair_loop3)
L(success3):
	lea	-1(%rcx), %rax		/* match starts one byte earlier
					   (bit marked the second char) */
	ret

	.p2align 4
L(next_pair3):
	/* Quadratic-behavior guard: charge this failed verification's
	   length (rax) against the budget; if total work exceeds bytes
	   scanned (rdi - r9) plus the initial 512, bail out to the
	   generic implementation.  */
	addq	%rax, %r11
	movq	%rdi, %rax
	subq	%r9, %rax		/* rax = bytes scanned so far */
	cmpq	%r11, %rax
	jl	L(switch_strstr)
	leaq	-1(%r8), %rax		/* next candidate bit */
	andq	%rax, %r8
	jne	L(next_pair_index3)
	jmp	L(loop)

	.p2align 4
L(switch_strstr):
	movq	%rdi, %rdi		/* no-op; kept as in upstream */
	jmp	__strstr_generic	/* tail call: args still in rdi/rsi */

	.p2align 4
L(cross_page):
	/* Haystack starts within 64 bytes of a page end: read the
	   surrounding 64-byte-aligned block with aligned loads (never
	   touching the next page), build the full candidate bitmap,
	   then shift out the bits before the real start.  As in the
	   main loop, first-char compares are at offset k-1, so a set
	   bit marks the pair's SECOND character.  */
	movq	%rdi, %rax
	pxor	%xmm0, %xmm0
	andq	$-64, %rax		/* rax = 64-aligned block start */
	movdqa	(%rax), %xmm3
	movdqu	-1(%rax), %xmm4		/* safe: block start is not at the
					   page start on this path */
	movdqa	%xmm3, %xmm8
	movdqa	16(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm0, %xmm8		/* NUL lanes, bytes 0..15 */
	pcmpeqb	%xmm2, %xmm3
	movdqa	%xmm5, %xmm7
	pminub	%xmm4, %xmm3
	movdqu	15(%rax), %xmm4
	pcmpeqb	%xmm0, %xmm7
	por	%xmm3, %xmm8
	movdqa	%xmm5, %xmm3
	movdqa	32(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	movdqa	%xmm5, %xmm6
	pmovmskb %xmm8, %ecx
	pminub	%xmm4, %xmm3
	movdqu	31(%rax), %xmm4
	por	%xmm3, %xmm7
	movdqa	%xmm5, %xmm3
	pcmpeqb	%xmm0, %xmm6
	movdqa	48(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pmovmskb %xmm7, %r8d
	pcmpeqb	%xmm2, %xmm3
	pcmpeqb	%xmm5, %xmm0
	pminub	%xmm4, %xmm3
	movdqu	47(%rax), %xmm4
	por	%xmm3, %xmm6
	movdqa	%xmm5, %xmm3
	salq	$16, %r8
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	pmovmskb %xmm6, %r10d
	pminub	%xmm4, %xmm3
	por	%xmm3, %xmm0
	salq	$32, %r10
	orq	%r10, %r8
	orq	%rcx, %r8		/* r8 = bitmap for the whole block */
	movl	%edi, %ecx
	pmovmskb %xmm0, %edx
	subl	%eax, %ecx		/* cl = haystack offset in block */
	salq	$48, %rdx
	orq	%rdx, %r8
	shrq	%cl, %r8		/* discard bits before haystack */
	je	L(loop_header)
L(next_pair_index4):
	bsfq	%r8, %rax
	addq	%rdi, %rax		/* rax -> second char of candidate */
	cmpb	$0, (%rax)
	je	L(zero)

	cmpq	%rax, %rdi		/* bit 0: pair's first char would be
					   before the haystack -- skip it */
	je	L(next_pair4)

	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found3)
	cmpb	1(%rax), %dl
	jne	L(next_pair4)
	xorl	%edx, %edx
	jmp	L(pair_loop_start4)

	.p2align 4
L(pair_loop4):
	addq	$1, %rdx
	cmpb	1(%rax,%rdx), %cl
	jne	L(next_pair4)
L(pair_loop_start4):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop4)
L(found3):
	subq	$1, %rax		/* back up to the first char */
	ret

	.p2align 4
L(next_pair4):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index4)
	jmp	L(loop_header)		/* block exhausted: enter main loop */

	.p2align 4
L(found):
	/* NOTE(review): appears unreferenced in this file; "rep ret" is
	   the classic two-byte return for old AMD branch predictors.  */
	rep
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax		/* not found: return NULL */
	ret

END(__strstr_sse2_unaligned)