1/* strcmp with unaligned loads 2 Copyright (C) 2013-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <isa-level.h> 20 21/* Continue building as ISA level 2. We use this as ISA V2 default 22 because strcmp-sse42 uses pcmpstri (slow on some SSE4.2 23 processors) and this implementation is potenially faster than 24 strcmp-sse42 (aside from the slower page cross case). */ 25#if ISA_SHOULD_BUILD (2) 26 27# define STRCMP_ISA _sse2_unaligned 28# include "strcmp-naming.h" 29 30# include "sysdep.h" 31 32ENTRY (STRCMP) 33 movl %edi, %eax 34 xorl %edx, %edx 35 pxor %xmm7, %xmm7 36 orl %esi, %eax 37 andl $4095, %eax 38 cmpl $4032, %eax 39 jg L(cross_page) 40 movdqu (%rdi), %xmm1 41 movdqu (%rsi), %xmm0 42 pcmpeqb %xmm1, %xmm0 43 pminub %xmm1, %xmm0 44 pxor %xmm1, %xmm1 45 pcmpeqb %xmm1, %xmm0 46 pmovmskb %xmm0, %eax 47 testq %rax, %rax 48 je L(next_48_bytes) 49L(return): 50 bsfq %rax, %rdx 51 movzbl (%rdi, %rdx), %eax 52 movzbl (%rsi, %rdx), %edx 53 subl %edx, %eax 54 ret 55 56 .p2align 4 57L(next_48_bytes): 58 movdqu 16(%rdi), %xmm6 59 movdqu 16(%rsi), %xmm3 60 movdqu 32(%rdi), %xmm5 61 pcmpeqb %xmm6, %xmm3 62 movdqu 32(%rsi), %xmm2 63 pminub %xmm6, %xmm3 64 pcmpeqb %xmm1, %xmm3 65 movdqu 48(%rdi), %xmm4 66 pcmpeqb %xmm5, %xmm2 67 pmovmskb %xmm3, %edx 68 movdqu 48(%rsi), %xmm0 69 pminub %xmm5, %xmm2 70 pcmpeqb %xmm1, %xmm2 71 pcmpeqb %xmm4, %xmm0 72 pmovmskb %xmm2, %eax 73 salq $16, %rdx 74 pminub %xmm4, %xmm0 75 pcmpeqb %xmm1, %xmm0 76 salq $32, %rax 77 orq %rdx, %rax 78 pmovmskb %xmm0, %ecx 79 movq %rcx, %rdx 80 salq $48, %rdx 81 orq %rdx, %rax 82 jne L(return) 83L(main_loop_header): 84 leaq 64(%rdi), %rdx 85 movl $4096, %ecx 86 pxor %xmm9, %xmm9 87 andq $-64, %rdx 88 subq %rdi, %rdx 89 leaq (%rdi, %rdx), %rax 90 addq %rsi, %rdx 91 movq %rdx, %rsi 92 andl $4095, %esi 93 subq %rsi, %rcx 94 shrq $6, %rcx 95 movq %rcx, %rsi 96 jmp L(loop_start) 97 98 .p2align 4 99L(loop): 100 addq $64, %rax 101 addq $64, %rdx 102L(loop_start): 103 testq %rsi, %rsi 104 leaq -1(%rsi), %rsi 105 je L(loop_cross_page) 106L(back_to_loop): 107 movdqu (%rdx), %xmm0 108 movdqu 16(%rdx), %xmm1 109 movdqa (%rax), %xmm2 110 movdqa 16(%rax), %xmm3 111 pcmpeqb %xmm2, %xmm0 112 movdqu 32(%rdx), %xmm5 113 pcmpeqb %xmm3, %xmm1 114 pminub %xmm2, %xmm0 115 movdqu 48(%rdx), %xmm6 116 pminub %xmm3, %xmm1 117 movdqa 32(%rax), %xmm2 118 pminub %xmm1, %xmm0 119 movdqa 48(%rax), %xmm3 120 pcmpeqb %xmm2, %xmm5 121 pcmpeqb %xmm3, %xmm6 122 pminub %xmm2, %xmm5 123 pminub %xmm3, %xmm6 124 pminub %xmm5, %xmm0 125 pminub %xmm6, %xmm0 126 pcmpeqb %xmm7, %xmm0 127 pmovmskb %xmm0, %ecx 128 testl %ecx, %ecx 129 je L(loop) 130 pcmpeqb %xmm7, %xmm5 131 movdqu (%rdx), %xmm0 132 pcmpeqb %xmm7, %xmm1 133 movdqa (%rax), %xmm2 134 pcmpeqb %xmm2, %xmm0 135 pminub %xmm2, %xmm0 136 pcmpeqb %xmm7, %xmm6 137 pcmpeqb %xmm7, %xmm0 138 pmovmskb %xmm1, %ecx 139 pmovmskb %xmm5, %r8d 140 pmovmskb %xmm0, %edi 141 salq $16, %rcx 142 salq $32, %r8 143 pmovmskb %xmm6, %esi 144 orq %r8, %rcx 145 orq %rdi, %rcx 146 salq $48, %rsi 147 orq %rsi, %rcx 148 bsfq %rcx, %rcx 149 movzbl (%rax, %rcx), %eax 150 movzbl (%rdx, %rcx), %edx 151 subl %edx, %eax 152 ret 153 154 .p2align 4 155L(loop_cross_page): 156 xor %r10, %r10 157 movq %rdx, %r9 158 and $63, %r9 159 subq %r9, %r10 160 161 movdqa (%rdx, %r10), %xmm0 162 movdqa 16(%rdx, %r10), %xmm1 163 movdqu (%rax, %r10), %xmm2 164 movdqu 16(%rax, %r10), %xmm3 165 pcmpeqb %xmm2, %xmm0 166 movdqa 32(%rdx, %r10), %xmm5 167 pcmpeqb %xmm3, %xmm1 168 pminub %xmm2, %xmm0 169 movdqa 48(%rdx, %r10), %xmm6 170 pminub %xmm3, %xmm1 171 movdqu 32(%rax, %r10), %xmm2 172 movdqu 48(%rax, %r10), %xmm3 173 pcmpeqb %xmm2, %xmm5 174 pcmpeqb %xmm3, %xmm6 175 pminub %xmm2, %xmm5 176 pminub %xmm3, %xmm6 177 178 pcmpeqb %xmm7, %xmm0 179 pcmpeqb %xmm7, %xmm1 180 pcmpeqb %xmm7, %xmm5 181 pcmpeqb %xmm7, %xmm6 182 183 pmovmskb %xmm1, %ecx 184 pmovmskb %xmm5, %r8d 185 pmovmskb %xmm0, %edi 186 salq $16, %rcx 187 salq $32, %r8 188 pmovmskb %xmm6, %esi 189 orq %r8, %rdi 190 orq %rcx, %rdi 191 salq $48, %rsi 192 orq %rsi, %rdi 193 movq %r9, %rcx 194 movq $63, %rsi 195 shrq %cl, %rdi 196 test %rdi, %rdi 197 je L(back_to_loop) 198 bsfq %rdi, %rcx 199 movzbl (%rax, %rcx), %eax 200 movzbl (%rdx, %rcx), %edx 201 subl %edx, %eax 202 ret 203 204 .p2align 4 205L(cross_page_loop): 206 cmpb %cl, %al 207 jne L(different) 208 addq $1, %rdx 209 cmpq $64, %rdx 210 je L(main_loop_header) 211L(cross_page): 212 movzbl (%rdi, %rdx), %eax 213 movzbl (%rsi, %rdx), %ecx 214 testb %al, %al 215 jne L(cross_page_loop) 216 xorl %eax, %eax 217L(different): 218 subl %ecx, %eax 219 ret 220END (STRCMP) 221#endif 222