/* strlen optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* ISA level >= 2 for both strlen and wcslen.  wcslen uses `pminud`
   which is SSE4.1.  strlen doesn't have an ISA level == 2
   implementation so the SSE2 implementation must be built with ISA
   level == 2.  */
# if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_sse2
# endif

/* This file builds four variants from one body, selected by AS_STRNLEN
   and AS_WCSLEN: strlen, strnlen, wcslen, wcsnlen.  For the wide-char
   variants the element size is 4 bytes, so byte ops become dword ops
   and the byte count in %rax is divided by 4 on return.  */
# ifdef AS_WCSLEN
#  define PMINU		pminud
#  define PCMPEQ	pcmpeqd
#  define SHIFT_RETURN	shrq	$2, %rax
# else
#  define PMINU		pminub
#  define PCMPEQ	pcmpeqb
#  define SHIFT_RETURN
# endif

# ifndef SECTION
#  define SECTION(p)	p
# endif

/* ABI: SysV AMD64.
   In:   %rdi = s; for strnlen/wcsnlen additionally %rsi = maxlen.
   Out:  %rax = length (in characters).
   Long lived registers in strlen(s), strnlen(s, n) are:

	%xmm3 - zero
	%rdi  - s
	%r10  - (s+n) & (~(64-1))
	%r11  - s+n
*/


	.section SECTION(.text),"ax",@progbits
ENTRY(STRLEN)

/* Test 64 bytes from %rax for zero.  Save result as bitmask in %rdx:
   bit k of %rdx is set iff byte (%rax)+k is zero.  Requires %xmm0-3
   to be zero on entry; clobbers %rsi, %rcx, %r8.  */
# define FIND_ZERO	\
	PCMPEQ	(%rax), %xmm0;	\
	PCMPEQ	16(%rax), %xmm1;	\
	PCMPEQ	32(%rax), %xmm2;	\
	PCMPEQ	48(%rax), %xmm3;	\
	pmovmskb	%xmm0, %esi;	\
	pmovmskb	%xmm1, %edx;	\
	pmovmskb	%xmm2, %r8d;	\
	pmovmskb	%xmm3, %ecx;	\
	salq	$16, %rdx;	\
	salq	$16, %rcx;	\
	orq	%rsi, %rdx;	\
	orq	%r8, %rcx;	\
	salq	$32, %rcx;	\
	orq	%rcx, %rdx;

# ifdef AS_STRNLEN
/* Do not read anything when n==0.  */
	test	%RSI_LP, %RSI_LP
	jne	L(n_nonzero)
	xor	%rax, %rax
	ret
L(n_nonzero):
#  ifdef AS_WCSLEN
/* Check for overflow from maxlen * sizeof(wchar_t).  If it would
   overflow the only way this program doesn't have undefined behavior
   is if there is a null terminator in valid memory so wcslen will
   suffice.  */
	mov	%RSI_LP, %R10_LP
	sar	$62, %R10_LP
	jnz	OVERFLOW_STRLEN
	sal	$2, %RSI_LP
#  endif

/* Initialize long lived registers.  */
	add	%RDI_LP, %RSI_LP	/* %rsi = s + n (end pointer).  */
	mov	%RSI_LP, %R10_LP
	and	$-64, %R10_LP		/* %r10 = end rounded down to 64.  */
	mov	%RSI_LP, %R11_LP	/* %r11 = exact end pointer.  */
# endif

	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	movq	%rdi, %rax
	movq	%rdi, %rcx
	andq	$4095, %rcx
/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
	cmpq	$4047, %rcx
/* We cannot unify this branching as it would be ~6 cycles slower.  */
	ja	L(cross_page)

# ifdef AS_STRNLEN
/* Test if end is among first 64 bytes.  */
#  define STRNLEN_PROLOG	\
	mov	%r11, %rsi;	\
	subq	%rax, %rsi;	\
	andq	$-64, %rax;	\
	testq	$-64, %rsi;	\
	je	L(strnlen_ret)
# else
#  define STRNLEN_PROLOG  andq $-64, %rax;
# endif

/* Ignore bits in mask that come before start of string.  %cl holds the
   misalignment of s within the aligned chunk; shifting %rdx right by it
   discards matches that precede s (shift count is taken mod 64).  */
# define PROLOG(lab)	\
	movq	%rdi, %rcx;	\
	xorq	%rax, %rcx;	\
	STRNLEN_PROLOG;	\
	sarq	%cl, %rdx;	\
	test	%rdx, %rdx;	\
	je	L(lab);	\
	bsfq	%rdx, %rax;	\
	SHIFT_RETURN;	\
	ret

# ifdef AS_STRNLEN
	andq	$-16, %rax
	FIND_ZERO
# else
	/* Test first 16 bytes unaligned.  */
	movdqu	(%rax), %xmm4
	PCMPEQ	%xmm0, %xmm4
	pmovmskb	%xmm4, %edx
	test	%edx, %edx
	je	L(next48_bytes)
	bsf	%edx, %eax	/* If eax is zeroed 16bit bsf can be used.  */
	SHIFT_RETURN
	ret

L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes.  */
	andq	$-16, %rax
	PCMPEQ	16(%rax), %xmm1
	PCMPEQ	32(%rax), %xmm2
	PCMPEQ	48(%rax), %xmm3
	pmovmskb	%xmm1, %edx
	pmovmskb	%xmm2, %r8d
	pmovmskb	%xmm3, %ecx
	salq	$16, %rdx
	salq	$16, %rcx
	orq	%r8, %rcx
	salq	$32, %rcx
	orq	%rcx, %rdx
# endif

	/* When no zero byte is found xmm1-3 are zero so we do not have to
	   zero them.  */
	PROLOG(loop)

	.p2align 4
L(cross_page):
	/* s lies within 16 bytes of a page end; align down to 64 so all
	   reads stay inside the page that contains s.  */
	andq	$-64, %rax
	FIND_ZERO
	PROLOG(loop_init)

# ifdef AS_STRNLEN
/* We must do this check to correctly handle strnlen (s, -1).  */
L(strnlen_ret):
	/* Plant a synthetic "terminator" bit at the end offset so bsf
	   never scans past s+n.  */
	bts	%rsi, %rdx
	sarq	%cl, %rdx
	test	%rdx, %rdx
	je	L(loop_init)
	bsfq	%rdx, %rax
	SHIFT_RETURN
	ret
# endif
	.p2align 4
L(loop_init):
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
# ifdef AS_STRNLEN
	.p2align 4
L(loop):

	addq	$64, %rax
	cmpq	%rax, %r10
	je	L(exit_end)

	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit)
	jmp	L(loop)

	.p2align 4
L(exit_end):
	cmp	%rax, %r11
	je	L(first)	/* Do not read when end is at page boundary.  */
	pxor	%xmm0, %xmm0
	FIND_ZERO

L(first):
	/* Cap the scan at s+n: set the bit at offset r11 (mod 64) so bsf
	   returns at most the remaining length.  */
	bts	%r11, %rdx
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

	.p2align 4
L(exit):
	pxor	%xmm0, %xmm0
	FIND_ZERO

	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

# else

	/* Main loop.  Unrolled twice to improve L2 cache performance on
	   core2.  */
	.p2align 4
L(loop):

	movdqa	64(%rax), %xmm0
	PMINU	80(%rax), %xmm0
	PMINU	96(%rax), %xmm0
	PMINU	112(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit64)

	subq	$-128, %rax	/* Shorter encoding than addq $128.  */

	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit0)
	jmp	L(loop)

	.p2align 4
L(exit64):
	addq	$64, %rax
L(exit0):
	pxor	%xmm0, %xmm0
	FIND_ZERO

	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

# endif

END(STRLEN)
#endif