1/* Copyright (C) 1996-2022 Free Software Foundation, Inc.
2   This file is part of the GNU C Library.
3
4   The GNU C Library is free software; you can redistribute it and/or
5   modify it under the terms of the GNU Lesser General Public
6   License as published by the Free Software Foundation; either
7   version 2.1 of the License, or (at your option) any later version.
8
9   The GNU C Library is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Lesser General Public License for more details.
13
14   You should have received a copy of the GNU Lesser General Public
15   License along with the GNU C Library.  If not, see
16   <https://www.gnu.org/licenses/>.  */
17
18/* Bytewise compare two null-terminated strings of length no longer than N.  */
19
20#include <sysdep.h>
21
22	.set noat		# AT is named explicitly in the PROF prologue below
23	.set noreorder		# instruction order below is chosen by hand
24
25/* EV6 only predicts one branch per octaword.  We'll use these to push
26   subsequent branches back to the next bundle.  This will generally add
27   a fetch+decode cycle to older machines, so skip in that case.
   ev6_unop therefore expands to a unop when __alpha_fix__ is defined
   and to nothing otherwise.
   NOTE(review): __alpha_fix__ presumably stands in for "built for EV6
   or newer"; confirm against the compiler's predefined macros.  */
28#ifdef __alpha_fix__
29# define ev6_unop	unop
30#else
31# define ev6_unop
32#endif
33
34	.text
35
/* strncmp entry and common setup.

   In:   a0 = s1, a1 = s2, a2 = count.
   Out:  v0 = 0 when no difference is found before a null or the end
	 of the count, otherwise 1 or -1 (chosen at $wordcmp).
   Scratch: t0-t10 and a3; a0, a1 and a2 are advanced/consumed.

   The setup below leaves, for every later path:
     t0, t1 = first quadword of s1 / s2 (ldq_u, so rounded-down address)
     t3     = -1
     t4, t5 = offset of s1 / s2 within its quadword
     a2     = (count + t4) / 8, whole s1 quadwords still to compare
     t10    = (count + t4) % 8, counted bytes in the final s1 quadword
     a3     = s2 + count  */
36ENTRY(strncmp)
37#ifdef PROF
38	ldgp	gp, 0(pv)	# load gp from the procedure value
39	lda	AT, _mcount
40	jsr	AT, (AT), _mcount	# call _mcount, return address left in AT
41	.prologue 1		# prologue ends; gp has been set up
42#else
43	.prologue 0		# prologue ends; gp is not set up
44#endif
45
46	xor	a0, a1, t2	# are s1 and s2 co-aligned?
47	beq	a2, $zerolength	# count == 0: return 0 without touching memory
48	ldq_u	t0, 0(a0)	# load asap to give cache time to catch up
49	ldq_u	t1, 0(a1)
50	lda	t3, -1		# all ones, source for the byte masks
51	and	t2, 7, t2	# nonzero iff the quadword offsets differ
52	srl	t3, 1, t6	# t6 = LONG_MAX
53	and	a0, 7, t4	# find s1 misalignment
54	and	a1, 7, t5	# find s2 misalignment
55	cmovlt	a2, t6, a2	# bound neg count to LONG_MAX
56	addq	a1, a2, a3	# s2+count
57	addq	a2, t4, a2	# bias count by s1 misalignment
58	and	a2, 7, t10	# ofs of last byte in s1 last word
59	srl	a2, 3, a2	# remaining full words in s1 count
60	bne	t2, $unaligned	# different offsets: s2 must be realigned
61
62	/* On entry to this basic block:
63	   t0 == the first word of s1.
64	   t1 == the first word of s2.
65	   t3 == -1.
	   a2 == full words remaining in the count, t10 == bytes of the
	   count in the last word.
	   Both strings start at the same offset within a quadword, so
	   a single mask built from a1 serves for t0 and t1.  */
66$aligned:
67	mskqh	t3, a1, t8	# mask off leading garbage
68	ornot	t1, t8, t1	# bytes before the start become 0xff in both
69	ornot	t0, t8, t0	# words: equal to each other and never null
70	cmpbge	zero, t1, t7	# bits set iff null found
71	beq	a2, $eoc	# check end of count
72	bne	t7, $eos
73	beq	t10, $ant_loop	# count ends on a word boundary
74
75	/* Aligned compare main loop.
76	   On entry to this basic block:
77	   t0 == an s1 word.
78	   t1 == an s2 word not containing a null.
	   Only used when t10 != 0, so when a2 reaches zero the words
	   just loaded still hold t10 counted bytes for $eoc to compare.  */
79
80	.align 4
81$a_loop:
82	xor	t0, t1, t2	# e0	:
83	bne	t2, $wordcmp	# .. e1 (zdb)
84	ldq_u	t1, 8(a1)	# e0    :
85	ldq_u	t0, 8(a0)	# .. e1 :
86
87	subq	a2, 1, a2	# e0    :
88	addq	a1, 8, a1	# .. e1 :
89	addq	a0, 8, a0	# e0    :
90	beq	a2, $eoc	# .. e1 : last, partial word of the count
91
92	cmpbge	zero, t1, t7	# e0    :
93	beq	t7, $a_loop	# .. e1 :
94
95	br	$eos		# null seen in the s2 word just loaded
96
97	/* Alternate aligned compare loop, for when there's no trailing
98	   bytes on the count.  We have to avoid reading too much data.
	   The count is therefore tested before the next pair of loads;
	   running out of words here means every counted byte matched.  */
99	.align 4
100$ant_loop:
101	xor	t0, t1, t2	# e0	:
102	ev6_unop
103	ev6_unop
104	bne	t2, $wordcmp	# .. e1 (zdb)
105
106	subq	a2, 1, a2	# e0    :
107	beq	a2, $zerolength	# .. e1 : count used up, return 0
108	ldq_u	t1, 8(a1)	# e0    :
109	ldq_u	t0, 8(a0)	# .. e1 :
110
111	addq	a1, 8, a1	# e0    :
112	addq	a0, 8, a0	# .. e1 :
113	cmpbge	zero, t1, t7	# e0    :
114	beq	t7, $ant_loop	# .. e1 :
115
116	br	$eos		# null seen in the s2 word just loaded
117
118	/* The two strings are not co-aligned.  Align s1 and cope.  */
119	/* On entry to this basic block:
120	   t0 == the first word of s1.
121	   t1 == the first word of s2.
122	   t3 == -1.
123	   t4 == misalignment of s1.
124	   t5 == misalignment of s2.
125	  t10 == misalignment of s1 end.
	   a3 == s2 + count.  */
126	.align	4
127$unaligned:
128	/* If s2 misalignment is larger than s1 misalignment (t9 != 0),
129	   the adjusted a1 stays inside the first word of s2 and 8(a1) is
	   the word after it, so we need extra startup checks before
	   loading that word to avoid SEGV.  */
130	subq	a1, t4, a1	# adjust s2 for s1 misalignment
131	cmpult	t4, t5, t9	# t9 = (s1 misalignment < s2 misalignment)
132	subq	a3, 1, a3	# last byte of s2
133	bic	a1, 7, t8	# word holding the adjusted a1
134	mskqh	t3, t5, t7	# mask garbage in s2
135	subq	a3, t8, a3	# byte distance from that word to the last byte
136	ornot	t1, t7, t7	# t7 = first s2 word, leading garbage set to 0xff
137	srl	a3, 3, a3	# remaining full words in s2 count
138	beq	t9, $u_head	# no checks needed, go load 8(a1)
139
140	/* Failing that, we need to look for both eos and eoc within the
141	   first word of s2.  If we find either, we can continue by
142	   pretending that the next word of s2 is all zeros.  */
143	lda	t2, 0		# next = zero
144	cmpeq	a3, 0, t8	# eoc in the first word of s2?
145	cmpbge	zero, t7, t7	# eos in the first word of s2?
146	or	t7, t8, t8
147	bne	t8, $u_head_nl	# yes: skip the load of the next s2 word
148
149	/* We know just enough now to be able to assemble the first
150	   full word of s2.  We can still find a zero at the end of it.
151
152	   On entry to this basic block:
153	   t0 == first word of s1
154	   t1 == first partial word of s2.
155	   t3 == -1.
156	   t10 == ofs of last byte in s1 last word.
157	   a2 == remaining full words in s1 count.
	   a3 == remaining full words in s2 count.
	   At $u_head_nl, t2 == second partial word of s2, or zero when
	   the load was skipped.  */
158$u_head:
159	ldq_u	t2, 8(a1)	# load second partial s2 word
160	subq	a3, 1, a3	# account for the s2 word just loaded
161$u_head_nl:
162	extql	t1, a1, t1	# create first s2 word
163	mskqh	t3, a0, t8	# ones from the s1 start offset upwards
164	extqh	t2, a1, t4	# low bytes of second word, moved to the top
165	ornot	t0, t8, t0	# kill s1 garbage
166	or	t1, t4, t1	# s2 word now complete
167	cmpbge	zero, t0, t7	# find eos in first s1 word
168	ornot	t1, t8, t1	# kill s2 garbage
169	beq	a2, $eoc	# count ends inside the first s1 word
170	subq	a2, 1, a2	# first s1 word is accounted for
171	bne	t7, $eos
172	mskql	t3, a1, t8	# mask out s2[1] bits we have seen
173	xor	t0, t1, t4	# compare aligned words
174	or	t2, t8, t8	# s2[1] with the bytes already used set to 0xff
175	bne	t4, $wordcmp
176	cmpbge	zero, t8, t7	# eos in high bits of s2[1]?
177	cmpeq	a3, 0, t8	# eoc in s2[1]?
178	or	t7, t8, t7
179	bne	t7, $u_final	# either one: no further s2 word may be loaded
180
181	/* Unaligned compare main loop.  In order to avoid reading too much,
182	   the loop is structured to detect zeros in aligned words from s2.
183	   This has, unfortunately, effectively pulled half of a loop
184	   iteration out into the head and half into the tail, but it does
185	   prevent nastiness from accumulating in the very thing we want
186	   to run as fast as possible.
187
188	   On entry to this basic block:
189	   t2 == the unshifted low-bits from the next s2 word.
190	   t10 == ofs of last byte in s1 last word.
191	   a2 == remaining full words in s1 count.
	   a3 == remaining full words in s2 count.  */
192	.align 4
193$u_loop:
194	extql	t2, a1, t3	# e0    : unused bytes of the previous s2 load
195	ldq_u	t2, 16(a1)	# .. e1 : load next s2 high bits
196	ldq_u	t0, 8(a0)	# e0    : load next s1 word
197	addq	a1, 8, a1	# .. e1 :
198
199	addq	a0, 8, a0	# e0    :
200	subq	a3, 1, a3	# .. e1 : account for the s2 word just loaded
201	extqh	t2, a1, t1	# e0    : low bytes of the new s2 load
202	cmpbge	zero, t0, t7	# .. e1 : eos in current s1 word
203
204	or	t1, t3, t1	# e0    : s2 word now complete
205	beq	a2, $eoc	# .. e1 : eoc in current s1 word
206	subq	a2, 1, a2	# e0    :
207	cmpbge	zero, t2, t4	# .. e1 : eos in s2[1]
208
209	xor	t0, t1, t3	# e0    : compare the words
210	ev6_unop
211	ev6_unop
212	bne	t7, $eos	# .. e1 :
213
214	cmpeq	a3, 0, t5	# e0    : eoc in s2[1]
215	ev6_unop
216	ev6_unop
217	bne	t3, $wordcmp	# .. e1 :
218
219	or	t4, t5, t4	# e0    : eos or eoc in s2[1].
220	beq	t4, $u_loop	# .. e1 (zdb)
221
222	/* We've found a zero in the low bits of the last s2 word.  Get
223	   the next s1 word and align them.

	   On entry the current s1 word has compared equal to its s2
	   counterpart and holds no null; a2 and t10 describe what is left
	   of the count after it.  If both are zero the count ended exactly
	   at the end of that word, so the strings are equal and the next
	   s1 word must not be touched: none of it is inside the count, and
	   it may sit on a page that is not mapped.  Previously the load
	   was issued unconditionally and $eoc then masked away every byte
	   of it, so the result is unchanged.  */
224	.align 3
225$u_final:
	or	a2, t10, t7	# any counted bytes beyond the current word?
	beq	t7, $zerolength	# no: equal through the count, return 0
226	ldq_u	t0, 8(a0)	# next s1 word; at least one byte is counted
227	extql	t2, a1, t1	# rest of the last s2 word, zero-filled above
228	cmpbge	zero, t1, t7	# always finds a zero (real null or the fill)
229	bne	a2, $eos	# full words remain in the count: null decides
230
231	/* We've hit end of count.  Zero everything after the count
232	   and compare what's left.  t10 is the number of bytes of this
	   word that are still inside the count.  */
233	.align 3
234$eoc:
235	mskql	t0, t10, t0	# keep the low t10 bytes of each word
236	mskql	t1, t10, t1
237	cmpbge	zero, t1, t7	# cleared bytes count as nulls for $eos
238
239	/* We've found a zero somewhere in a word we just read.
240	   On entry to this basic block:
241	   t0 == s1 word
242	   t1 == s2 word
243	   t7 == cmpbge mask containing the zero.
	   Only the bytes up to and including the first zero take part in
	   the comparison.  */
244	.align 3
245$eos:
246	negq	t7, t6		# create bytemask of valid data
247	and	t6, t7, t8	# t8 = lowest set bit = first zero byte
248	subq	t8, 1, t6	# bits for the bytes before it
249	or	t6, t8, t7	# t7 = bytes up to and including that zero
250	zapnot	t0, t7, t0	# kill the garbage
251	zapnot	t1, t7, t1
252	xor	t0, t1, v0	# ... and compare
253	beq	v0, $done	# identical: return with v0 == 0
254
255	/* Here we have two differing co-aligned words in t0 & t1.
256	   Bytewise compare them and return (t0 > t1 ? 1 : -1).
	   The decision is made on the lowest-numbered differing byte,
	   comparing the two byte values as unsigned.  */
257	.align 3
258$wordcmp:
259	cmpbge	t0, t1, t2	# comparison yields bit mask of ge
260	cmpbge	t1, t0, t3	# same with the operands swapped
261	xor	t2, t3, t0	# bits set iff t0/t1 bytes differ
262	negq	t0, t1		# clear all but least bit
263	and	t0, t1, t0
264	lda	v0, -1		# assume the t0 byte is the smaller one
265	and	t0, t2, t1	# was bit set in t0 > t1?
266	cmovne	t1, 1, v0	# it was: return 1 instead
267$done:
268	ret
269
270	.align 3
	/* Return 0 without comparing anything further.  Reached from the
	   entry check when count is zero and from $ant_loop when the
	   count runs out.  */
271$zerolength:
272	clr	v0
273	ret
274
275	END(strncmp)
	/* NOTE(review): libc_hidden_builtin_def is defined outside this
	   file; presumably it emits the hidden alias used for calls from
	   inside libc -- confirm.  */
276libc_hidden_builtin_def (strncmp)
277