1/* Optimized strlen implementation using SIMD. 2 Copyright (C) 2018-2022 Free Software Foundation, Inc. 3 4 This file is part of the GNU C Library. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library. If not, see 18 <https://www.gnu.org/licenses/>. */ 19 20#include <sysdep.h> 21 22/* Assumptions: 23 * 24 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. 25 * Not MTE compatible. 26 */ 27 28#define srcin x0 29#define len x0 30 31#define src x1 32#define data1 x2 33#define data2 x3 34#define has_nul1 x4 35#define has_nul2 x5 36#define tmp1 x4 37#define tmp2 x5 38#define tmp3 x6 39#define tmp4 x7 40#define zeroones x8 41 42#define maskv v0 43#define maskd d0 44#define dataq1 q1 45#define dataq2 q2 46#define datav1 v1 47#define datav2 v2 48#define tmp x2 49#define tmpw w2 50#define synd x3 51#define shift x4 52 53/* For the first 32 bytes, NUL detection works on the principle that 54 (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a 55 byte is zero, and can be done in parallel across the entire word. */ 56 57#define REP8_01 0x0101010101010101 58#define REP8_7f 0x7f7f7f7f7f7f7f7f 59 60/* To test the page crossing code path more thoroughly, compile with 61 -DTEST_PAGE_CROSS - this will force all calls through the slower 62 entry path. This option is not intended for production use. */ 63 64#ifdef TEST_PAGE_CROSS 65# define MIN_PAGE_SIZE 32 66#else 67# define MIN_PAGE_SIZE 4096 68#endif 69 70/* Core algorithm: 71 72 Since strings are short on average, we check the first 32 bytes of the 73 string for a NUL character without aligning the string. In order to use 74 unaligned loads safely we must do a page cross check first. 75 76 If there is a NUL byte we calculate the length from the 2 8-byte words 77 using conditional select to reduce branch mispredictions (it is unlikely 78 strlen will be repeatedly called on strings with the same length). 79 80 If the string is longer than 32 bytes, align src so we don't need further 81 page cross checks, and process 32 bytes per iteration using a fast SIMD 82 loop. 83 84 If the page cross check fails, we read 32 bytes from an aligned address, 85 and ignore any characters before the string. If it contains a NUL 86 character, return the length, if not, continue in the main loop. */ 87 88ENTRY (__strlen_asimd) 89 PTR_ARG (0) 90 91 and tmp1, srcin, MIN_PAGE_SIZE - 1 92 cmp tmp1, MIN_PAGE_SIZE - 32 93 b.hi L(page_cross) 94 95 /* Look for a NUL byte in the first 16 bytes. */ 96 ldp data1, data2, [srcin] 97 mov zeroones, REP8_01 98 99#ifdef __AARCH64EB__ 100 /* For big-endian, carry propagation (if the final byte in the 101 string is 0x01) means we cannot use has_nul1/2 directly. 102 Since we expect strings to be small and early-exit, 103 byte-swap the data now so has_null1/2 will be correct. */ 104 rev data1, data1 105 rev data2, data2 106#endif 107 sub tmp1, data1, zeroones 108 orr tmp2, data1, REP8_7f 109 sub tmp3, data2, zeroones 110 orr tmp4, data2, REP8_7f 111 bics has_nul1, tmp1, tmp2 112 bic has_nul2, tmp3, tmp4 113 ccmp has_nul2, 0, 0, eq 114 b.eq L(bytes16_31) 115 116 /* Find the exact offset of the first NUL byte in the first 16 bytes 117 from the string start. Enter with C = has_nul1 == 0. */ 118 csel has_nul1, has_nul1, has_nul2, cc 119 mov len, 8 120 rev has_nul1, has_nul1 121 csel len, xzr, len, cc 122 clz tmp1, has_nul1 123 add len, len, tmp1, lsr 3 124 ret 125 126 .p2align 3 127 /* Look for a NUL byte at offset 16..31 in the string. */ 128L(bytes16_31): 129 ldp data1, data2, [srcin, 16] 130#ifdef __AARCH64EB__ 131 rev data1, data1 132 rev data2, data2 133#endif 134 sub tmp1, data1, zeroones 135 orr tmp2, data1, REP8_7f 136 sub tmp3, data2, zeroones 137 orr tmp4, data2, REP8_7f 138 bics has_nul1, tmp1, tmp2 139 bic has_nul2, tmp3, tmp4 140 ccmp has_nul2, 0, 0, eq 141 b.eq L(loop_entry) 142 143 /* Find the exact offset of the first NUL byte at offset 16..31 from 144 the string start. Enter with C = has_nul1 == 0. */ 145 csel has_nul1, has_nul1, has_nul2, cc 146 mov len, 24 147 rev has_nul1, has_nul1 148 mov tmp3, 16 149 clz tmp1, has_nul1 150 csel len, tmp3, len, cc 151 add len, len, tmp1, lsr 3 152 ret 153 154L(loop_entry): 155 bic src, srcin, 31 156 157 .p2align 5 158L(loop): 159 ldp dataq1, dataq2, [src, 32]! 160 uminp maskv.16b, datav1.16b, datav2.16b 161 uminp maskv.16b, maskv.16b, maskv.16b 162 cmeq maskv.8b, maskv.8b, 0 163 fmov synd, maskd 164 cbz synd, L(loop) 165 166 /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ 167 cmeq maskv.16b, datav1.16b, 0 168 sub len, src, srcin 169 tst synd, 0xffffffff 170 b.ne 1f 171 cmeq maskv.16b, datav2.16b, 0 172 add len, len, 16 1731: 174 /* Generate a bitmask and compute correct byte offset. */ 175#ifdef __AARCH64EB__ 176 bic maskv.8h, 0xf0 177#else 178 bic maskv.8h, 0x0f, lsl 8 179#endif 180 umaxp maskv.16b, maskv.16b, maskv.16b 181 fmov synd, maskd 182#ifndef __AARCH64EB__ 183 rbit synd, synd 184#endif 185 clz tmp, synd 186 add len, len, tmp, lsr 2 187 ret 188 189 .p2align 4 190 191L(page_cross): 192 bic src, srcin, 31 193 mov tmpw, 0x0c03 194 movk tmpw, 0xc030, lsl 16 195 ld1 {datav1.16b, datav2.16b}, [src] 196 dup maskv.4s, tmpw 197 cmeq datav1.16b, datav1.16b, 0 198 cmeq datav2.16b, datav2.16b, 0 199 and datav1.16b, datav1.16b, maskv.16b 200 and datav2.16b, datav2.16b, maskv.16b 201 addp maskv.16b, datav1.16b, datav2.16b 202 addp maskv.16b, maskv.16b, maskv.16b 203 fmov synd, maskd 204 lsl shift, srcin, 1 205 lsr synd, synd, shift 206 cbz synd, L(loop) 207 208 rbit synd, synd 209 clz len, synd 210 lsr len, len, 1 211 ret 212 213END (__strlen_asimd) 214libc_hidden_builtin_def (__strlen_asimd) 215