1/* memrchr - find the last occurrence of a byte in a memory block
2
3   Copyright (C) 2015-2022 Free Software Foundation, Inc.
4
5   This file is part of the GNU C Library.
6
7   The GNU C Library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 2.1 of the License, or (at your option) any later version.
11
12   The GNU C Library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   Lesser General Public License for more details.
16
17   You should have received a copy of the GNU Lesser General Public
18   License along with the GNU C Library.  If not, see
19   <https://www.gnu.org/licenses/>.  */
20
21#include <sysdep.h>
22
23/* Assumptions:
24 *
25 * ARMv8-a, AArch64, Advanced SIMD.
26 * MTE compatible.
27 */
28
29/* Arguments and results.  */
30#define srcin		x0	/* in: start address of the buffer */
31#define chrin		w1	/* in: byte value to search for */
32#define cntin		x2	/* in: number of bytes in the buffer */
33#define result		x0	/* out: match address or 0; same register as srcin */
34
/* General-purpose scratch registers.  */
35#define src		x3	/* 16-byte-aligned address of the chunk being examined */
36#define cntrem		x4	/* bytes still to examine below src (loop counter) */
37#define synd		x5	/* 64-bit match syndrome moved out of dend */
38#define shift		x6	/* shift count that trims first-chunk bytes at/after end */
39#define	tmp		x7	/* short-lived scratch value */
40#define end		x8	/* srcin + cntin, i.e. one past the last byte */
41#define endm1		x9	/* end - 1, i.e. address of the last byte */
42
/* SIMD registers.  qdata/vdata are two views of register 1 and vend/dend
   are two views of register 3.  */
43#define vrepchr		v0	/* chrin replicated into all 16 byte lanes */
44#define qdata		q1	/* 128-bit view used as the ldr destination in the loop */
45#define vdata		v1	/* the 16 bytes of the current chunk */
46#define vhas_chr	v2	/* cmeq result: all-ones in each lane that equals chrin */
47#define vend		v3	/* vhas_chr reduced from 128 to 64 bits */
48#define dend		d3	/* 64-bit view of vend, copied to synd with fmov */
49
50/*
51   Core algorithm:
52   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
53   per byte.  We take 4 bits of every comparison byte using the shift-right-and-
54   narrow-by-4 instruction (SHRN).  Since the bits in the nibble mask reflect the
55   order in which the bytes occur in the original buffer, counting leading zeros
56   identifies exactly which byte holds the last match.  */
57
/* __memrchr: find the last occurrence of a byte in a memory block.

   In:   srcin (x0) = start of the buffer
         chrin (w1) = byte to look for (replicated per byte lane by dup)
         cntin (x2) = length of the buffer in bytes
   Out:  result (x0) = address of the highest-addressed matching byte,
         or 0 when there is no match or cntin is 0.
   Uses: x3-x9, v0-v3 and the condition flags; no stack, no calls.

   Structure: the 16-byte-aligned chunk holding the last byte is examined
   first, with the bytes at or beyond `end' trimmed off.  If nothing is found
   there, the loop walks downwards 16 bytes at a time, two chunks per
   iteration, until a chunk matches or the chunk containing srcin has been
   examined.  Matches located below srcin are rejected at the very end.  */
58ENTRY (__memrchr)
	/* PTR_ARG/SIZE_ARG come from sysdep.h and are not visible here;
	   presumably they normalise the pointer/size arguments for ILP32 -
	   TODO confirm.  */
59	PTR_ARG (0)
60	SIZE_ARG (2)
61	add	end, srcin, cntin		/* one past the last byte */
62	sub	endm1, end, 1			/* address of the last byte */
63	bic	src, endm1, 15			/* aligned chunk holding the last byte */
64	cbz	cntin, L(nomatch)		/* empty buffer: nothing to load */
	/* src is 16-byte aligned, so this load may cover bytes outside
	   [srcin, end) but never leaves the aligned 16-byte chunk.  */
65	ld1	{vdata.16b}, [src]
66	dup	vrepchr.16b, chrin
67	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* shift = -4 * end.  A register-specified LSL only uses the low 6 bits
	   of the count, so the effective shift is 4 * ((-end) mod 16): four
	   bits for every byte of this chunk that lies at or beyond `end'.  */
68	neg	shift, end, lsl 2
69	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
70	fmov	synd, dend
	/* Push the out-of-range nibbles off the top; the most significant
	   nibble of synd now corresponds to the byte at endm1.
	   NOTE(review): unlike L(end) there is no big-endian rbit here -
	   presumably because ld1 keeps the lane order independent of
	   endianness while ldr q does not; confirm.  */
71	lsl	synd, synd, shift
72	cbz	synd, L(start_loop)
73
	/* A match in the first chunk: clz / 4 is its distance in bytes below
	   endm1.  result aliases srcin, so the bounds check is done against
	   cntin: the match is inside the buffer only if distance < cntin.  */
74	clz	synd, synd
75	sub	result, endm1, synd, lsr 2
76	cmp	cntin, synd, lsr 2
77	csel	result, result, xzr, hi
78	ret
79
80L(start_loop):
	/* tmp = number of buffer-side bytes covered by the first chunk
	   (end - src); cntrem = cntin - tmp = bytes that remain below src.
	   If cntin <= tmp the first chunk already covered the whole buffer.  */
81	sub	tmp, end, src
82	subs	cntrem, cntin, tmp
83	b.ls	L(nomatch)
84
85	/* Make sure that it won't overread by a 16-byte chunk */
	/* Bit 4 of (cntrem + 15) is the parity of the number of 16-byte chunks
	   left.  The loop consumes two chunks per iteration and only tests the
	   counter after the second one, so with an odd count we enter at the
	   second half.  */
86	add	tmp, cntrem, 15
87	tbnz	tmp, 4, L(loop32_2)
88
89	.p2align 4
90L(loop32):
	/* First half: step src down by 16 (pre-index writeback) and load.
	   umaxp folds the 128-bit compare result to 64 bits, which is non-zero
	   iff any byte matched; the exact position is worked out at L(end).  */
91	ldr	qdata, [src, -16]!
92	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
93	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
94	fmov	synd, dend
95	cbnz	synd, L(end)
96
97L(loop32_2):
	/* Second half: same as above, plus the loop-count test.  The flags set
	   by subs are consumed by b.ls after the cmeq, so vhas_chr is already
	   valid for L(end) when this turns out to be the final chunk.  */
98	ldr	qdata, [src, -16]!
99	subs	cntrem, cntrem, 32
100	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
101	b.ls	L(end)
102	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
103	fmov	synd, dend
104	cbz	synd, L(loop32)
105L(end):
	/* Reached with a match in the chunk at src, or with the final chunk
	   (possibly without a match).  Build the 4-bits-per-byte mask from
	   vhas_chr and locate the highest-addressed matching byte.  */
106	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
107	fmov	synd, dend
108
109	add	tmp, src, 15			/* last byte of this chunk */
110#ifdef __AARCH64EB__
111	rbit	synd, synd
112#endif
113	clz	synd, synd
	/* tmp = address of the match.  With no match clz gives 64, so tmp
	   becomes src - 1.  Anything below srcin is rejected by the unsigned
	   compare and yields 0.  */
114	sub	tmp, tmp, synd, lsr 2
115	cmp	tmp, srcin
116	csel	result, tmp, xzr, hs
117	ret
118
119L(nomatch):
120	mov	result, 0
121	ret
122
123END (__memrchr)
/* weak_alias and libc_hidden_builtin_def are macros defined outside this
   file; presumably they publish the routine under the name memrchr -
   TODO confirm against their definitions.  */
124weak_alias (__memrchr, memrchr)
125libc_hidden_builtin_def (memrchr)
126