/* memcmp - compare memory

   Copyright (C) 2013-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

/* int memcmp (const void *src1, const void *src2, size_t limit)

   AAPCS64: arguments arrive in x0-x2; the int result leaves in w0
   (which aliases x0/src1, so src1 is dead once the result is set).

   Strategy by size:
     limit <  16   -> L(less16): branch-free overlapping head/tail loads
     limit <= 32   -> first 16 bytes inline, last 16 at L(last_bytes)
     limit <  160  -> L(loop32): 32 bytes/iteration in integer registers
     limit >= 160  -> L(loop_align): align src2, then L(loop64) compares
                      64 bytes/iteration with Advanced SIMD.  */

/* Register allocation.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		w0

#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define data3		x5
#define data3w		w5
#define data4		x6
#define data4w		w6
/* tmp aliases data4 (x6) - the two are never live simultaneously.  */
#define tmp		x6
/* One-past-the-end pointers, used for overlapping tail loads.  */
#define src1end	x7
#define src2end	x8


ENTRY (memcmp)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	limit, 16
	b.lo	L(less16)
	/* At least 16 bytes: compare the first 16 immediately.  */
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	/* Flag trick: the cmp above left EQ set iff limit == 16.  The
	   first ccmp only compares data1/data2 when limit != 16;
	   otherwise it forces NZCV=0 (i.e. NE), and the second ccmp then
	   also forces NE, so for limit == 16 we branch straight to
	   L(return2) with the 16 bytes already loaded.  */
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)		/* 17-32: only the tail 16 remain.  */
	cmp	limit, 160
	b.hs	L(loop_align)		/* Large: use the SIMD loop.  */
	sub	limit, limit, 32

	.p2align 4
	/* Medium path (33-159 bytes): 32 bytes per iteration using
	   integer registers.  Loads use offsets 16/32 and the pointers
	   advance by 32, because bytes [0,16) were already checked.  */
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq	/* NE if either 8-byte pair differs.  */
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
L(last64):	/* Also the re-entry point after an all-equal SIMD loop.  */
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  These loads
	   may overlap bytes already compared; that is harmless since
	   those bytes are known equal.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
L(return2):
	/* Reduce two 8-byte pairs to one: keep (data1,data2) if they
	   differ, otherwise any difference must be in (data3,data4).  */
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/* Little-endian: byte-reverse so the FIRST differing byte (the
	   lowest-addressed one) governs the unsigned comparison.  On
	   big-endian the bytes are already in memory order.  */
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne		/* 0 if equal, else 1 ...  */
	cneg	result, result, lo	/* ... negated to -1 if data1 < data2.  */
	ret

	.p2align 4
	/* Short path (< 16 bytes): test limit one bit at a time; each
	   case loads from the start and from the end of the buffers,
	   with overlap when limit is not a power of two.  */
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	/* 8-15 bytes: two possibly-overlapping 8-byte loads per buffer.  */
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	/* 4-7 bytes: two possibly-overlapping 4-byte loads per buffer.
	   The w-register loads zero-extend, so L(return2) works on the
	   full x registers unchanged.  */
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	tbz	limit, 1, L(less2)
	/* 2-3 bytes: compare the leading halfword.  */
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	/* Odd length: the last byte decides.  A plain subtract of the
	   zero-extended bytes yields the signed result directly.  */
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w
L(return_zero):
	ret

L(loop_align):
	/* >= 160 bytes.  Check bytes [16,32) now so the alignment fixup
	   below may rewind the pointers without skipping unchecked data.  */
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  tmp ends up in
	   [-16,-1], so the pointers move backwards by up to 16 bytes
	   (all of which were just verified equal) and src2+16 becomes
	   16-byte aligned for the q-register loads.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	sub	limit, limit, 64 + 16

	.p2align 4
	/* SIMD loop: 64 bytes per iteration.  EOR the corresponding
	   16-byte quads, then fold everything with UMAXP; the final
	   64-bit value in d0 is zero iff all 64 bytes matched.  Loads
	   are interleaved with the EOR/UMAXP chain, and the last pair
	   uses pre-indexing to advance both pointers by 64.  */
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	/* Final pairwise fold: d0 is now an 8-byte difference mask.  */
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	/* Continue while the chunk matched (tmp == 0, giving EQ below)
	   AND more than 64 bytes remain (HI from the subs); when HI is
	   false the ccmp forces NE and the loop exits.  */
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.
	   Each nonzero byte of tmp flags a differing 8-byte group of the
	   64-byte window just compared, which (after the pre-indexed
	   advance) spans offsets [-48, 16) from src1/src2.  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp		/* Normalize lane order on big-endian.  */
#endif
	rev	tmp, tmp
	clz	tmp, tmp		/* Bit index of the first flagged group.  */
	bic	tmp, tmp, 7		/* Round down to a whole byte lane.  */
	sub	tmp, tmp, 48		/* Rebase to a load offset in [-48, 8].  */
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	/* Little-endian: memory order for the unsigned comparison,
	   as at L(return).  */
	rev	data1, data1
	rev	data2, data2
#endif
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo	/* -1 if data1 < data2, else 1.  */
	ret

END (memcmp)
#undef bcmp
weak_alias (memcmp, bcmp)
#undef __memcmpeq
strong_alias (memcmp, __memcmpeq)
libc_hidden_builtin_def (memcmp)
libc_hidden_def (__memcmpeq)