/* Copyright (C) 2013-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

	.text
	.p2align 6
	.rep 9
	nop	/* Pad so that the loop below fits a cache line.  */
	.endr
ENTRY_ALIGN (strncmp, 0)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
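	/* A worked example of that identity (illustrative only; the value
	   below is hypothetical and not part of the original source).
	   Take a dword whose third-highest byte is NUL:
	       X                            = 0x6162006364656667
	       X - REP8_01                  = 0x6060ff6263646566
	       ~(X | REP8_7f)               = 0x8080808080808080
	       (X - REP8_01) & ~(X|REP8_7f) = 0x0000800000000000
	   The surviving 0x80 sits exactly in the NUL byte's position.
	   Bytes less significant than the first zero byte are tested
	   exactly; the subtraction can only borrow across bytes at or above
	   a zero byte, which is the carry-propagation caveat handled in the
	   big-endian path below.  */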
	/* Start of performance-critical section  -- one 64B cache line.  */
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of performance-critical section  -- one 64B cache line.  */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	limit, L(not_limit)

	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 6
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret
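	/* A rough C sketch of L(byte_loop)/L(done) above, assuming
	   limit >= 1 on entry (guaranteed by the cbz at the top of the
	   function).  Illustrative only -- the two ccmp instructions fold
	   the three loop conditions into the flags, clearing NZCV to force
	   an exit as soon as one condition fails:

	       unsigned char c1, c2;
	       do
		 {
		   c1 = *src1++;
		   c2 = *src2++;
		 }
	       while (--limit > 0 && c1 != 0 && c1 == c2);
	       return c1 - c2;

	   The first ccmp tests data1w against NUL only while bytes remain
	   (hi); the second compares the bytes only if no NUL was found
	   (cs).  */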
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1		| a a a a a a a a | b b b c c c c c | . . .
	   src2		| x x x x x a a a | a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
		STEP_A		STEP_B		STEP_C
	   data1    a a a a a a a a	b b b c c c c c	b b b c c c c c
	   data2    a a a a a a a a	b b b 0 0 0 0 0	0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each step
	   the loop can exit, or read from SRC1 or SRC2.  */
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts are ignoring upper bits.  */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid carry-propagation
	   problem described above.  This way we can reuse the has_nul in the
	   next step and also use syndrome value trick at the end.  */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret

END (strncmp)
libc_hidden_builtin_def (strncmp)
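/* Behavioural examples, per ISO C strncmp semantics rather than anything
   specific to this implementation (illustrative only):
       strncmp ("abc", "abd", 2) == 0    -- limit reached before the diff
       strncmp ("\xff", "\x01", 1) > 0   -- bytes compare as unsigned char
   Both properties are exercised above: `limit' gates every compare, and
   the final subtraction is done on zero-extended bytes.  */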