1/* memcmp - compare memory
2
3   Copyright (C) 2013-2022 Free Software Foundation, Inc.
4
5   This file is part of the GNU C Library.
6
7   The GNU C Library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 2.1 of the License, or (at your option) any later version.
11
12   The GNU C Library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   Lesser General Public License for more details.
16
17   You should have received a copy of the GNU Lesser General Public
18   License along with the GNU C Library.  If not, see
19   <https://www.gnu.org/licenses/>.  */
20
21#include <sysdep.h>
22
23/* Assumptions:
24 *
25 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
26 */
27
/* Argument registers (AAPCS64): x0/x1 = source pointers, x2 = length.
   w0 doubles as the int return value.  */
#define src1	x0
#define src2	x1
#define limit	x2
#define result	w0

/* Scratch registers: two pairs of compare words loaded with ldp.  */
#define data1	x3
#define data1w	w3
#define data2	x4
#define data2w	w4
#define data3	x5
#define data3w	w5
#define data4	x6
#define data4w	w6
/* tmp aliases data4 (both x6); the two uses never overlap — tmp is
   live only in the SIMD path, before data4 is reloaded.  */
#define tmp	x6
/* One-past-the-end pointers, used for overlapping tail loads.  */
#define src1end	x7
#define src2end	x8
44
45
ENTRY (memcmp)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	limit, 16
	b.lo	L(less16)
	/* Compare the first 16 bytes.  The first ccmp's 'ne' condition
	   tests limit != 16 (flags are still from the cmp above): when
	   limit == 16 both ccmps fail and force NZCV = 0 (i.e. 'ne'),
	   so we branch straight to return2 with all 16 bytes already
	   loaded and compute the result from data1..data4.  */
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)		/* 17..32 bytes: compare last 16.  */
	cmp	limit, 160
	b.hs	L(loop_align)		/* Large input: SIMD loop.  */
	sub	limit, limit, 32	/* limit = bytes beyond current 32.  */

	.p2align 4
	/* Scalar loop for 33..159 bytes: 32 bytes per iteration, offset
	   by 16 since bytes 0..15 were checked above.  */
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq	/* Forces 'ne' if data1 != data2.  */
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)		/* <= 16 left: finish with tail.  */

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
L(return2):
	/* Select the first differing pair (data3/data4 if the first
	   pair is equal).  */
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/* Byte-reverse so an unsigned 64-bit compare orders the words
	   by the memory (byte-wise) order, as memcmp requires.  */
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne		/* 0 if equal, 1 otherwise...  */
	cneg	result, result, lo	/* ...negated if data1 < data2.  */
	ret

	.p2align 4
	/* 0-15 bytes: dispatch on the size bits of limit, comparing
	   overlapping pairs of loads from the start and the end.  */
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	/* 8-15 bytes: two possibly-overlapping 8-byte loads.  */
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	/* 4-7 bytes: two possibly-overlapping 4-byte loads.  */
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	tbz	limit, 1, L(less2)
	/* 2-3 bytes: compare the leading halfword; if equal, fall
	   through to handle the (possible) final byte.  */
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	/* 0-1 remaining bytes: compare the last byte if limit is odd.  */
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w	/* Byte difference is the result.  */
L(return_zero):
	ret

L(loop_align):
	/* Compare bytes 16..31 first, so the alignment adjustment below
	   may safely skip backwards over up to 16 checked bytes.  */
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16		/* tmp in [-16, -1].  */
	sub	src2, src2, tmp		/* Round src2 up to 16 bytes.  */
	add	limit, limit, tmp	/* Account for bytes skipped over.  */
	sub	src1, src1, tmp		/* Keep src1 in step (unaligned).  */
	sub	limit, limit, 64 + 16	/* Bias: loads run 16..80 ahead.  */

	.p2align 4
	/* SIMD loop: 64 bytes per iteration.  Differences are gathered
	   with EOR and reduced by pairwise unsigned max into d0, so tmp
	   becomes nonzero iff any byte differed.  */
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b	/* Each d0 byte covers 8 bytes.  */
	fmov	tmp, d0
	ccmp	tmp, 0, 0, hi		/* Test tmp only if limit > 0;  */
	b.eq	L(loop64)		/* loop while equal and data left.  */

	/* If equal, process last 1-64 bytes using scalar loop.  */
	add	limit, limit, 64 + 16	/* Undo the bias applied above.  */
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp
	bic	tmp, tmp, 7		/* 8 * index of first nonzero byte.  */
	sub	tmp, tmp, 48		/* Bytes compared lie at -48..+15
					   relative to the updated srcs.  */
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo	/* 1 or -1 by memory byte order.  */
	ret

END (memcmp)
/* Export memcmp under its additional entry points: the historical
   bcmp and the equality-only __memcmpeq.  The #undefs remove any
   macro definitions of those names pulled in by headers before the
   alias macros paste them as symbols.  */
#undef bcmp
weak_alias (memcmp, bcmp)
#undef __memcmpeq
strong_alias (memcmp, __memcmpeq)
libc_hidden_builtin_def (memcmp)
libc_hidden_def (__memcmpeq)
211