/* Optimized strncmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* Implements the function

   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t n [r5])

   The implementation uses unaligned doubleword accesses to avoid specialized
   code paths depending on data alignment for the first 32 bytes, and uses
   vectorised loops after that.  */
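
/* A rough scalar sketch of the semantics (the real code below works on
   8-byte and 16-byte chunks rather than single bytes):

     int strncmp (const char *s1, const char *s2, size_t n)
     {
       for (; n > 0; s1++, s2++, n--)
         {
           if (*s1 != *s2)
             return (unsigned char) *s1 - (unsigned char) *s2;
           if (*s1 == '\0')
             return 0;
         }
       return 0;
     }  */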

#ifndef STRNCMP
# define STRNCMP strncmp
#endif

/* TODO: Change this to actual instructions when minimum binutils is upgraded
   to 2.27.  Macros are defined below for these newer instructions in order
   to maintain compatibility.  */
#define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

#define VEXTUBRX(t,a,b) .long (0x1000070d \
				| ((t)<<(32-11))  \
				| ((a)<<(32-16))  \
				| ((b)<<(32-21)) )

#define VCMPNEZB(t,a,b) .long (0x10000507 \
				| ((t)<<(32-11))  \
				| ((a)<<(32-16))  \
				| ((b)<<(32-21)) )
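
/* As a worked example, VCTZLSBB(6, 7) places 6 in the RT field (bits 6-10)
   and 7 in the VRB field (bits 16-20) of the base opcode, producing
   .long 0x10c13e02, the encoding of "vctzlsbb r6, v7".  */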

/* Get 16 bytes for unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.  */
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vperm	v8, v2, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	cmplw	cr6, r5, r11; \
	ble	cr6, 2f; \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3;
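
/* A rough C sketch of GET16BYTES's control flow (v0 is zeros, v2 is
   all-ones, r11 holds the number of valid bytes in the first block and
   r5 the remaining length; the names here are illustrative):

     first = load16_aligned (addr);            // lvx truncates addr to 16
     if (null_in_valid_bytes (first))          // vperm + vcmpequb.
       second = zeros;                         // never read past the null
     else if (remaining > valid_bytes)
       second = load16_aligned (addr + 16);    // more bytes are needed
     // else leave "second" stale; those bytes fall beyond n and the
     // caller's length checks make them irrelevant.
     reg1 = align_concat (second, first, mask);  // final vperm  */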

/* TODO: change this to .machine power9 when minimum binutils
   is upgraded to 2.27.  */
	.machine  power7
ENTRY_TOCLESS (STRNCMP, 4)
	/* Check if size is 0.  */
	cmpdi	cr0, r5, 0
	beq	cr0, L(ret0)
	li	r0, 0

	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE being 32.  */
	rldicl	r8, r3, 0, 52	/* Extract s1 % 4096.  */
	cmpldi	cr7, r8, 4096-32
	bgt	cr7, L(pagecross)
	rldicl	r9, r4, 0, 52	/* Extract s2 % 4096.  */
	cmpldi	cr7, r9, 4096-32
	bgt	cr7, L(pagecross)

	/* For short strings up to 32 bytes, load both s1 and s2 using
	   unaligned dwords and compare.  */

	ld	r7, 0(r3)
	ld	r9, 0(r4)
	li	r8, 0
	cmpb	r8, r7, r8
	cmpb	r6, r7, r9
	orc.	r8, r8, r6
	bne	cr0, L(different1)
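
	/* The cmpb/orc. sequence above is a branch-free null-or-mismatch
	   test; roughly, in C (cmpb sets each result byte to 0xff where
	   the operand bytes are equal, 0x00 otherwise):

	     nulls = cmpb (r7, 0);     // 0xff where s1 has a null byte
	     eq    = cmpb (r7, r9);    // 0xff where s1 and s2 bytes match
	     mask  = nulls | ~eq;      // nonzero iff null or mismatch
	     if (mask != 0)
	       goto different1;  */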

	/* If the strings compared are equal, but size is less than or
	   equal to 8, return 0.  */
	cmpldi	cr7, r5, 8
	li	r9, 0
	ble	cr7, L(ret1)
	addi	r5, r5, -8

	ld	r7, 8(r3)
	ld	r9, 8(r4)
	cmpb	r8, r7, r8
	cmpb	r6, r7, r9
	orc.	r8, r8, r6
	bne	cr0, L(different1)
	cmpldi	cr7, r5, 8
	mr	r9, r8
	ble	cr7, L(ret1)
	/* Update pointers and size.  */
	addi	r5, r5, -8
	addi	r3, r3, 16
	addi	r4, r4, 16

	ld	r7, 0(r3)
	ld	r9, 0(r4)
	li	r8, 0
	cmpb	r8, r7, r8
	cmpb	r6, r7, r9
	orc.	r8, r8, r6
	bne	cr0, L(different1)
	cmpldi	cr7, r5, 8
	li	r9, 0
	ble	cr7, L(ret1)
	addi	r5, r5, -8

	ld	r7, 8(r3)
	ld	r9, 8(r4)
	cmpb	r8, r7, r8
	cmpb	r6, r7, r9
	orc.	r8, r8, r6
	bne	cr0, L(different1)
	cmpldi	cr7, r5, 8
	mr	r9, r8
	ble	cr7, L(ret1)

	/* Update pointers and size.  */
	addi	r5, r5, -8
	addi	r3, r3, 16
	addi	r4, r4, 16
L(align):
	/* The first 32 bytes have now been checked; align s1 to quadword
	   and adjust the s2 address.  */
	vspltisb	v0, 0
	vspltisb	v2, -1
	or	r6, r4, r3
	andi.	r6, r6, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4   /* Compute mask.  */
	clrldi	r6, r4, 60
	subfic	r11, r6, 16
	andi.	r6, r3, 0xF
	beq	cr0, L(s1_align)
	/* Both s1 and s2 are unaligned.  */
	GET16BYTES(v5, r4, v6)
	lvsr	v10, 0, r3   /* Compute mask.  */
	clrldi	r6, r3, 60
	subfic	r11, r6, 16
	GET16BYTES(v4, r3, v10)
	VCMPNEZB(v7, v5, v4)
	beq	cr6, L(match)
	b	L(different)

	/* Align s1 to qw and adjust s2 address.  */
	.align  4
L(match):
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	subf	r5, r11, r5
	add	r3, r3, r11
	add	r4, r4, r11
	andi.	r11, r4, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4
	clrldi	r6, r4, 60
	subfic	r11, r6, 16
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2, checks for null
	   and compares them.  It loops until a mismatch or null occurs.  */
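	/* One iteration, roughly, in C (a sketch; VCMPNEZB flags both
	   mismatching bytes and null bytes):

	     a = load16 (s1);          // s1 is 16-byte aligned here
	     b = get16bytes (s2);      // page-safe unaligned load
	     if (any_flagged (a, b))   // vcmpnezb
	       goto different;
	     if (n <= 16)
	       return 0;
	     n -= 16; s1 += 16; s2 += 16;  */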
L(s1_align):
	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16
	b	L(s1_align)
	.align  4
L(aligned):
	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16

	lvx	v4, 0, r3
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	bne	cr6, L(different)
	cmpldi	cr7, r5, 16
	ble	cr7, L(ret0)
	addi	r5, r5, -16
	addi	r3, r3, 16
	addi	r4, r4, 16
	b	L(aligned)
	/* Calculate and return the difference.  */
L(different):
	VCTZLSBB(r6, v7)
	cmplw	cr7, r5, r6
	ble	cr7, L(ret0)
	VEXTUBRX(r5, r6, v4)
	VEXTUBRX(r4, r6, v5)
	subf	r3, r4, r5
	extsw	r3, r3
	blr
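
	/* In rough C terms: vctzlsbb gives the byte index of the first
	   flagged (mismatching or null) byte, which must lie within the
	   first n bytes to matter:

	     i = first_flagged_index (v7);        // vctzlsbb
	     if (n <= i)
	       return 0;                          // difference is beyond n
	     return (int) v4[i] - (int) v5[i];    // vextubrx extracts  */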

	.align 4
L(ret0):
	li	r9, 0
L(ret1):
	mr	r3, r9
	blr

	/* A mismatch or null byte was found in the doublewords loaded into
	   r7 (from s1) and r9 (from s2); r8 holds the cmpb result marking
	   the offending bytes.  Locate the first marked byte, clamp its
	   offset to the remaining size, and return the byte difference:

	  leadzero = (__builtin_ffsl (z1) - 1);
	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
	  r1 = (r1 >> leadzero) & 0xFFUL;
	  r2 = (r2 >> leadzero) & 0xFFUL;
	  return r1 - r2;  */
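
	/* A self-contained C rendering of the pseudocode above (a sketch;
	   the function name and signature are illustrative only):

	     static int
	     first_diff (uint64_t z1, uint64_t r1, uint64_t r2, size_t n)
	     {
	       // z1 is a cmpb byte mask, so its lowest set bit already
	       // lies on a byte boundary.
	       int leadzero = __builtin_ffsll (z1) - 1;
	       if (leadzero > (int) (n - 1) * 8)
	         leadzero = (int) (n - 1) * 8;
	       r1 = (r1 >> leadzero) & 0xFFUL;
	       r2 = (r2 >> leadzero) & 0xFFUL;
	       return (int) r1 - (int) r2;
	     }  */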

	.align 4
L(different1):
	neg	r11, r8
	sldi	r5, r5, 3
	and	r8, r11, r8
	addi	r5, r5, -8
	cntlzd	r8, r8
	subfic	r8, r8, 63
	extsw	r8, r8
	cmpld	cr7, r8, r5
	ble	cr7, L(different2)
	mr	r8, r5
L(different2):
	extsw	r8, r8
	srd	r7, r7, r8
	srd	r9, r9, r8
	rldicl	r3, r7, 0, 56
	rldicl	r9, r9, 0, 56
	subf	r9, r9, r3
	extsw	r9, r9
	mr	r3, r9
	blr

	/* If an unaligned 16-byte read would cross a 4K page boundary, use
	   a simple byte-by-byte comparison until the page boundary for s1
	   is reached.  */
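
	/* Roughly, in C (a sketch of the loop below):

	     for (; to_page_end > 0 && n > 0; to_page_end--, n--)
	       {
	         unsigned char c1 = *s1++, c2 = *s2++;
	         if (c1 != c2 || c1 == '\0')
	           return c1 - c2;
	       }
	     if (n == 0)
	       return 0;
	     goto align;   // resume the doubleword/vector path  */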
	.align 4
L(pagecross):
	lbz	r7, 0(r3)
	lbz	r9, 0(r4)
	subfic	r8, r8, 4095
	cmplw	cr7, r9, r7
	bne	cr7, L(byte_ne_3)
	cmpdi	cr7, r9, 0
	beq	cr7, L(byte_ne_0)
	addi	r5, r5, -1
	subf	r7, r8, r5
	subf	r9, r7, r5
	addi	r9, r9, 1
	mtctr	r9
	b	L(pagecross_loop1)

	.align 4
L(pagecross_loop0):
	beq	cr7, L(ret0)
	lbz	r9, 0(r3)
	lbz	r8, 0(r4)
	addi	r5, r5, -1
	cmplw	cr7, r9, r8
	cmpdi	cr5, r9, 0
	bne	cr7, L(byte_ne_2)
	beq	cr5, L(byte_ne_0)
L(pagecross_loop1):
	cmpdi	cr7, r5, 0
	addi	r3, r3, 1
	addi	r4, r4, 1
	bdnz	L(pagecross_loop0)
	cmpdi	cr7, r7, 0
	li	r9, 0
	bne+	cr7, L(align)
	b	L(ret1)

	.align 4
L(byte_ne_0):
	li	r7, 0
L(byte_ne_1):
	subf	r9, r9, r7
	extsw	r9, r9
	b	L(ret1)

	.align 4
L(byte_ne_2):
	extsw	r7, r9
	mr	r9, r8
	b	L(byte_ne_1)
L(byte_ne_3):
	extsw	r7, r7
	b	L(byte_ne_1)
END (STRNCMP)
libc_hidden_builtin_def (strncmp)