1/* memchr - find a character in a memory zone using base integer registers
2
3   Copyright (C) 2018-2022 Free Software Foundation, Inc.
4
5   This file is part of the GNU C Library.
6
7   The GNU C Library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 2.1 of the License, or (at your option) any later version.
11
12   The GNU C Library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   Lesser General Public License for more details.
16
17   You should have received a copy of the GNU Lesser General Public
18   License along with the GNU C Library.  If not, see
19   <https://www.gnu.org/licenses/>.  */
20
21#include <sysdep.h>
22
23/* Assumptions:
24 *
25 * ARMv8-a, AArch64
26 * Use base integer registers.
27 */
28
/* Symbol name under which the routine is emitted.  A definition of
   MEMCHR made before this point is kept; otherwise the default below
   is used.  */
29#ifndef MEMCHR
30# define MEMCHR __memchr_nosimd
31#endif
32
33/* Arguments and results.  */
/* x0, x1 and x2 each carry two names (srcin/result, chrin/repchr,
   cntin/tmp1); writing the second name of a pair overwrites the
   first.  */
34#define srcin		x0
35#define chrin		x1
36#define cntin		x2
37
38#define result		x0
39
/* The search byte replicated into all eight byte lanes of a qword.  */
40#define repchr		x1
41
/* Scratch registers.  */
42#define tmp1		x2
43#define tmp2		x3
44#define tmp3		x4
45#define tmp4		x5
46
/* src:      address of the next 16-byte block to load.
   srcend:   srcin + cntin.
   srcend16: srcend rounded up to a multiple of 16.  */
47#define src		x6
48#define srcend		x7
49#define srcend16	x8
50
/* Zero when the block being examined is the last one, nonzero
   otherwise.  */
51#define anymore		x9
52
/* Holds the constant REP8_01.  */
53#define zeroones	x10
54
/* The two 64-bit halves of the current block after XOR with repchr.  */
55#define data1		x11
56#define data2		x12
57
/* Results of the zero-byte test on data1 and data2.  */
58#define has_chr1	x13
59#define has_chr2	x14
60
/* Byte-replicated constants used by the zero-byte test.  */
61#define REP8_01		0x0101010101010101
62#define REP8_7f		0x7f7f7f7f7f7f7f7f
63
64
/*
 * MEMCHR: search the cntin bytes starting at srcin for the first byte
 * equal to the low 8 bits of chrin.
 *
 * In:    x0 = srcin, x1 = chrin, x2 = cntin.
 * Out:   x0 = address of the first matching byte, or 0 if there is
 *        none (including the case cntin == 0).
 * Uses:  x0-x14 and the condition flags.
 *
 * Memory is read in whole 16-byte aligned blocks with ldp, so bytes of
 * the first and last block that lie outside the requested range are
 * loaded as well.  Leading bytes are masked off before testing; a
 * match at or past srcend is rejected by the final comparison.
 *
 * NOTE(review): ENTRY_ALIGN, PTR_ARG, SIZE_ARG, L, END and
 * libc_hidden_builtin_def are not defined in this file (presumably
 * they come from <sysdep.h>); confirm their expansion there.
 */
65ENTRY_ALIGN (MEMCHR, 6)
66
67	PTR_ARG (0)
68	SIZE_ARG (2)
69
70	/* Do not dereference srcin if no bytes to compare. */
71	cbz	cntin, L(none_chr)
72
73	/* Start address is 16-byte aligned or not? */
	/*
	 * The flags set by this tst are consumed by the b.eq L(loop)
	 * below; none of the instructions in between writes the flags.
	 * src is srcin rounded down to the start of its 16-byte block.
	 */
74	tst	srcin, 15
75	bic	src, srcin, 15
76
77	mov	zeroones, REP8_01
78	and	repchr, chrin, 255
79	/* Generate a qword integer as |c|c|c|c|c|c|c|c|. */
80	mul	repchr, repchr, zeroones
81
82	add	srcend, srcin, cntin
83	/*
84	 * srcend16 is the address of the block following the last block.
85	 *
86	 * [A block is 16-byte aligned and sized.]
87	 */
88	add	srcend16, srcend, 15
89	bic	srcend16, srcend16, 15
90
	/*
	 * Aligned start (Z set by the tst above): there are no leading
	 * bytes to mask, so enter the main loop directly.
	 */
91	b.eq	L(loop)
92
93	/* Load the first block containing start address. */
	/*
	 * ldp post-increments src by 16, so from here on src is the
	 * address of the block after the one held in data1/data2.
	 */
94	ldp	data1, data2, [src], 16
95
	/*
	 * tmp3 = mask whose bytes are 0xff at and after the start offset
	 * within a qword and 0x00 before it.  A register-specified shift
	 * uses the amount modulo 64, so only (srcin & 7) * 8 takes effect.
	 */
96	lsl	tmp1, srcin, 3
97	mov	tmp2, ~0
98#ifdef __AARCH64EB__
99	lsr	tmp3, tmp2, tmp1
100#else
101	lsl	tmp3, tmp2, tmp1
102#endif
103	/* Start address is in the first or the second qword? */
	/* The flags from this tst drive the csinv/csel below. */
104	tst	srcin, 8
105
106	/*
107	 * Transform any byte in the block to zero using XOR operation,
108	 * if that byte equals the char to search. In this way, searching
109	 * the char becomes detecting zero in the resulting two qwords.
110	 */
111	eor	data1, data1, repchr
112	eor	data2, data2, repchr
113
114	/*
115	 * Set those unused bytes (before start address) to 0xff, so
116	 * that they will not hit any zero detection.
117	 */
118	orn	tmp1, data1, tmp3
119	orn	tmp2, data2, tmp3
120
	/*
	 * eq (start in first qword):  data1 = masked data1,
	 *                             data2 = data2 as loaded.
	 * ne (start in second qword): data1 = ~0 (ignored entirely),
	 *                             data2 = masked data2.
	 */
121	csinv	data1, tmp1, xzr, eq
122	csel	data2, data2, tmp2, eq
123
124	/*
125	 * When the first and last block are the same, there are two cases:
126	 *  o. Memory range to search is just in one block.
127	 *      ( start address - end address) < 0
128	 *
129	 *  o. Memory range is so large that end address wrap-around.
130	 *      ( start address - end address) > 0
131	 */
	/*
	 * mi selects the first case: src (already advanced) is compared
	 * with srcend16, so ne means further blocks follow.  Otherwise
	 * ccmp sets NZCV = 0, which also reads as ne.  anymore becomes
	 * -1 for ne and 0 for eq.  The same flags are consumed by the
	 * ccmp in L(find_chr); no instruction before it rewrites them.
	 */
132	cmp	srcin, srcend
133	ccmp	src, srcend16, 0, mi
134	csetm	anymore, ne
135	b	L(find_chr)
136
137	.p2align 4
138L(loop):
139	ldp	data1, data2, [src], 16
140
	/*
	 * src has already been advanced, so the result is zero (Z set)
	 * exactly when the block just loaded is the last one.
	 */
141	subs	anymore, src, srcend16
142
143	/*
144	 * Transform any byte in the block to zero using XOR operation,
145	 * if that byte equals the char to search.
146	 */
147	eor	data1, data1, repchr
148	eor	data2, data2, repchr
149
150L(find_chr):
151	/*
152	 * Use the following integer test to find out if any byte in a
153	 * qword is zero. If the qword does not contain a zero-valued
154	 * byte, the test result is zero.
155	 *
156	 *  (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080
157	 * =
158	 *  (qword - 0x0101010101010101) & ~(qword  | 0x7f7f7f7f7f7f7f7f)
159	 *
160	 */
161	sub	tmp1, data1, zeroones
162	sub	tmp2, data2, zeroones
163
164	orr	tmp3, data1, REP8_7f
165	orr	tmp4, data2, REP8_7f
166
167	bic	has_chr1, tmp1, tmp3
168	bic	has_chr2, tmp2, tmp4
169
	/*
	 * Loop again only if more blocks remain (ne, from the flags set
	 * before reaching L(find_chr)) and neither qword reported a
	 * match.  On the last block ccmp forces NZCV = 0, so the b.eq
	 * falls through.
	 */
170	orr	tmp1, has_chr1, has_chr2
171	ccmp	tmp1, 0, 0, ne
172
173	b.eq	L(loop)
174
	/*
	 * Reached on a match or on the last block.  If the first qword
	 * has a match, result is the address of that qword (src is 16
	 * past the start of the current block).
	 */
175	cbz	has_chr1, 1f
176	sub	result, src, 16
177#ifdef __AARCH64EB__
	/* Byte-swap the data; the detection is redone in L(done). */
178	rev	data1, data1
179#else
	/* Put the lowest-addressed byte lane on top for the clz below. */
180	rev	has_chr1, has_chr1
181#endif
182	b	L(done)
183
	/*
	 * No match in the first qword.  If the second has none either,
	 * this was the last block and the search failed.  Otherwise
	 * result is the address of the second qword.
	 */
1841:	cbz	has_chr2, L(none_chr)
185	sub	result, src, 8
186#ifdef __AARCH64EB__
187	rev	data1, data2
188#else
189	rev	has_chr1, has_chr2
190#endif
191
192L(done):
193#ifdef __AARCH64EB__
194	/*
195	 * For big-endian, has_chr1/has_chr2 cannot be used directly
196	 * because the bytes of each qword are in reversed order after
197	 * loading from memory.  The char detection is therefore performed
198	 * again, this time on the byte-swapped qword.
199	 */
200	sub	tmp1, data1, zeroones
201	orr	tmp3, data1, REP8_7f
202	bic	has_chr1, tmp1, tmp3
203	rev	has_chr1, has_chr1
204#endif
205
206	/*
207	 * If the specified char is found in a qword, the corresponding
208	 * byte in has_chr has its top bit (0x80) set; this is only
209	 * reliable for the first occurrence, not for later occurrences.
210	 */
	/*
	 * clz / 8 is the byte offset of the first match within the qword.
	 * On the last block (anymore == 0) the match only counts when it
	 * lies before srcend; for earlier blocks ccmp forces N = 1 so that
	 * mi keeps result.  Otherwise 0 is returned.
	 */
211	cmp	anymore, 0
212	clz	tmp1, has_chr1
213	add	result, result, tmp1, lsr 3
214	ccmp	result, srcend, 8, eq	/* else NZCV = 0b1000 (N set) */
215	csel	result, result, xzr, mi
216	ret
217
218L(none_chr):
	/* No match: return 0. */
219	mov	result, 0
220	ret
221
222END (MEMCHR)
223libc_hidden_builtin_def (MEMCHR)
224