1/* memchr implemented using NEON.
2   Copyright (C) 2011-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library.  If not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
/* For __ARM_NEON__ this file defines memchr directly.  Otherwise the
   function is emitted under the internal name __memchr_neon and the
   hidden-builtin alias at the end of the file is suppressed (the
   generic memchr entry point is provided elsewhere in that build).  */
#ifndef __ARM_NEON__
# define memchr __memchr_neon
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(a)
#endif

	.arch	armv7-a
	.fpu	neon
30
31
/* Arguments */
#define srcin		r0	/* Start of the area to search.  */
#define chrin		r1	/* Character to find; only the low byte is used.  */
#define cntin		r2	/* Maximum number of bytes to search.  */

/* Retval */
#define result		r0	/* Live range does not overlap with srcin */

/* Working registers */
#define src		r1	/* Live range does not overlap with chrin */
#define tmp		r3
#define synd		r0	/* No overlap with srcin or result */
#define soff		r12	/* Scratch for the magic constant, then
				   srcin's misalignment within 32 bytes.  */

/* Working NEON registers */
#define vrepchr		q0	/* chrin replicated into all 16 byte lanes.  */
#define vdata0		q1
#define vdata0_0	d2	/* Lower half of vdata0 */
#define vdata0_1	d3	/* Upper half of vdata0 */
#define vdata1		q2
#define vdata1_0	d4	/* Lower half of vdata1 */
#define vdata1_1	d5	/* Upper half of vdata1 */
#define vrepmask	q3	/* Magic constant 0x8040201008040201 in each half.  */
#define vrepmask0	d6
#define vrepmask1	d7
#define vend		q4	/* Scratch; d8/d9 are callee-saved (AAPCS),
				   hence the vpush/vpop around the main loop.  */
#define vend0		d8
#define vend1		d9
60
61/*
62 * Core algorithm:
63 *
64 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
65 * byte. Each bit is set if the relevant byte matched the requested character
66 * and cleared otherwise. Since the bits in the syndrome reflect exactly the
 * order in which things occur in the original string, counting trailing zeros
 * allows us to identify exactly which byte has matched.
69 */
70
	.thumb_func
	.p2align 4,,15

ENTRY(memchr)
	/* void *memchr (const void *s, int c, size_t n)
	   In:  srcin (r0) = s, chrin (r1) = c, cntin (r2) = n.
	   Out: result (r0) = pointer to the first occurrence of
	   (unsigned char) c within the first n bytes of s, or NULL.  */

	/* Use a simple loop if there are less than 8 bytes to search.  */
	cmp	cntin, #7
	bhi	.Llargestr
	and	chrin, chrin, #0xff	/* Match on (unsigned char) c only.  */

.Lsmallstr:
	subs	cntin, cntin, #1
	blo	.Lnotfound	/* Return not found if reached end.  */
	ldrb	tmp, [srcin], #1
	cmp	tmp, chrin
	bne	.Lsmallstr	/* Loop again if not found.  */
	/* Otherwise fixup address and return.  */
	sub	result, srcin, #1	/* Undo the final post-increment.  */
	bx	lr


.Llargestr:
	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes. */
	/*
	 * Magic constant 0x8040201008040201 allows us to identify which lane
	 * matches the requested byte: ANDed with the all-ones/all-zeros lanes
	 * produced by VCEQ it leaves a single distinct bit per byte, so the
	 * pairwise-add reduction below folds the 32 comparison lanes into a
	 * 32-bit syndrome without losing position information.
	 */
	movw	tmp, #0x0201
	movt	tmp, #0x0804	/* tmp = 0x08040201 (low word of the magic).  */
	lsl	soff, tmp, #4	/* soff = 0x80402010 (high word; soff is
				   free as scratch until recomputed below).  */
	vmov	vrepmask0, tmp, soff
	vmov	vrepmask1, tmp, soff
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	ands	soff, srcin, #31	/* soff = misalignment within the chunk.  */
	beq	.Lloopintro	/* Go straight to main loop if it's aligned. */

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */
	vld1.8		{vdata0, vdata1}, [src:256]!
	sub		tmp, soff, #32	/* tmp = -(bytes of chunk at/after srcin).  */
	adds		cntin, cntin, tmp	/* cntin = bytes left beyond this
						   chunk; flags read by BLS below.  */
	vceq.i8		vdata0, vdata0, vrepchr	/* Lane = 0xff where byte matches.  */
	vceq.i8		vdata1, vdata1, vrepchr
	vand		vdata0, vdata0, vrepmask	/* Keep one bit per matching lane.  */
	vand		vdata1, vdata1, vrepmask
	vpadd.i8	vdata0_0, vdata0_0, vdata0_1	/* Reduce 32 lanes -> 16.  */
	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
	vpadd.i8	vdata0_0, vdata0_0, vdata1_0	/* -> 8.  */
	vpadd.i8	vdata0_0, vdata0_0, vdata0_0	/* -> 4 bytes = 32 bits.  */
	vmov		synd, vdata0_0[0]	/* synd bit n = match at chunk byte n.  */

	/* Clear the soff lower bits */
	lsr		synd, synd, soff
	lsl		synd, synd, soff
	/* The first block can also be the last */
	bls		.Lmasklast	/* LS is from the ADDS above; no
					   intervening instruction writes flags.  */
	/* Have we found something already? */
	cbnz		synd, .Ltail


.Lloopintro:
	vpush	{vend}		/* d8/d9 are callee-saved, so preserve them.  */
	/* 264/265 correspond to d8/d9 for q4 */
	cfi_adjust_cfa_offset (16)
	cfi_rel_offset (264, 0)
	cfi_rel_offset (265, 8)
	.p2align 3,,7
.Lloop:
	vld1.8		{vdata0, vdata1}, [src:256]!
	subs		cntin, cntin, #32
	vceq.i8		vdata0, vdata0, vrepchr
	vceq.i8		vdata1, vdata1, vrepchr
	/* If we're out of data we finish regardless of the result. */
	bls		.Lend
	/* Use a fast check for the termination condition. */
	vorr		vend, vdata0, vdata1	/* Any nonzero lane means a match...  */
	vorr		vend0, vend0, vend1
	vmov		synd, tmp, vend0	/* ...move both halves to core regs  */
	orrs		synd, synd, tmp		/* and OR them: Z set iff no match.
						   ORRS leaves C (from SUBS) intact.  */
	/* We're not out of data, loop if we haven't found the character. */
	beq		.Lloop

.Lend:
	vpop		{vend}	/* VPOP does not affect the condition flags.  */
	cfi_adjust_cfa_offset (-16)
	cfi_restore (264)
	cfi_restore (265)

	/* Termination condition found, let's calculate the syndrome value. */
	vand		vdata0, vdata0, vrepmask
	vand		vdata1, vdata1, vrepmask
	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
	vmov		synd, vdata0_0[0]
	cbz		synd, .Lnotfound	/* CBZ does not alter the flags.  */
	bhi		.Ltail	/* Uses the condition code from
				   subs cntin, cntin, #32 above.  */


.Lmasklast:
	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches.
	   Reached with cntin <= 0: -cntin is how many loaded bytes lie
	   past the end of the search area.  */
	neg	cntin, cntin
	lsl	synd, synd, cntin
	lsrs	synd, synd, cntin
	it	eq
	moveq	src, #0	/* If no match, set src to 0 so the retval is 0:
			   .Ltail then yields 0 - 32 + clz(rbit(0)) = 0.  */


.Ltail:
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Count the leading zeros */
	clz	synd, synd
	/* Compute the potential result and return */
	add	result, src, synd
	bx	lr


.Lnotfound:
	/* Set result to NULL if not found and return */
	mov	result, #0
	bx	lr

END(memchr)
libc_hidden_builtin_def (memchr)
203