/* Optimized strcmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

#ifndef STRCMP
# define STRCMP strcmp
#endif

/* Implements the function

   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   The implementation uses unaligned doubleword accesses for the first 16
   bytes, as in the POWER8 version, and uses vectorised loops after that.  */
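
/* Overall structure, as a rough roadmap: if a 16-byte read from either
   string could cross a 4K page boundary, fall back to a byte-by-byte
   loop up to the boundary; otherwise compare the first 16 bytes with
   unaligned doubleword loads and cmpb; after that, run 16-byte vector
   loops (an aligned variant and permute-based variants) until a
   mismatch or null byte is found.  */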

/* TODO: Change this to actual instructions when minimum binutils is upgraded
   to 2.27.  Macros are defined below for these newer instructions in order
   to maintain compatibility.  */
#define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

#define VEXTUBRX(t,a,b) .long (0x1000070d \
				| ((t)<<(32-11))  \
				| ((a)<<(32-16))  \
				| ((b)<<(32-21)) )

#define VCMPNEZB(t,a,b) .long (0x10000507 \
				| ((t)<<(32-11))  \
				| ((a)<<(32-16))  \
				| ((b)<<(32-21)) )
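
/* These opcodes correspond to the ISA 3.0 instructions vctzlsbb (Vector
   Count Trailing Zero Least-Significant Bits Byte), vextubrx (Vector
   Extract Unsigned Byte Right-Indexed) and the record form of vcmpnezb
   (Vector Compare Not Equal or Zero Byte), which also updates CR6.  */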

/* Get 16 bytes for unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.  */
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vperm	v8, v2, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi    r6, reg2, 16; \
	lvx     v9, 0, r6; \
2: \
	vperm   reg1, v9, reg1, reg3;
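
/* The macro above performs a page-safe unaligned load: it reads the
   aligned quadword containing reg2, pads the byte positions outside the
   requested 16 with 0xff (from v2) so only real string bytes can match
   the null comparison, and loads the following quadword only when no
   null byte was found among the bytes already read.  The access
   therefore never touches a page beyond the end of the string.  */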

/* TODO: change this to .machine power9 when the minimum required binutils
   allows it.  */

	.machine  power7
ENTRY_TOCLESS (STRCMP, 4)
	li	r0, 0

	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
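
	/* For example, if the low 12 bits of s1 are 0xff8, the offset within
	   the page is 4088, which is greater than 4096 - 16 = 4080, so a
	   16-byte load starting at s1 would cross into the next page.  */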

	rldicl	r7, r3, 0, 52
	rldicl	r9, r4, 0, 52
	cmpldi	cr7, r7, 4096-16
	bgt	cr7, L(pagecross_check)
	cmpldi	cr5, r9, 4096-16
	bgt	cr5, L(pagecross_check)

	/* For short strings up to 16 bytes, load both s1 and s2 using
	   unaligned doublewords and compare.  */
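	/* As a rough C sketch of the test below (illustrative names only;
	   cmpb yields 0xff in each byte position where its operands match):

	     uint64_t w1 = load8 (s1), w2 = load8 (s2);
	     uint64_t has_nul = cmpb (w1, 0);
	     uint64_t eq      = cmpb (w1, w2);
	     if ((has_nul | ~eq) != 0)    - null or mismatch found
	       goto different_nocmpb;  */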
	ld	r8, 0(r3)
	ld	r10, 0(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	ld	r8, 8(r3)
	ld	r10, 8(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	addi	r7, r3, 16
	addi	r4, r4, 16

L(align):
	/* The first 16 bytes have now been checked.  */
	vspltisb	v0, 0
	vspltisb	v2, -1
	lvsr	v6, 0, r4   /* Compute permute control vector for s2.  */
	or	r5, r4, r7
	andi.	r5, r5, 0xF   /* Are both s1 and s2 quadword aligned?  */
	beq	cr0, L(aligned)
	andi.	r5, r7, 0xF   /* Is s1 quadword aligned?  */
	beq	cr0, L(s1_align)
	lvsr	v10, 0, r7   /* Compute permute control vector for s1.  */

	/* Both s1 and s2 are unaligned.  */
	GET16BYTES(v4, r7, v10)
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	beq	cr6, L(match)
	b	L(different)

	/* Align s1 to a quadword and adjust s2 address accordingly.  */
	.align  4
L(match):
	clrldi	r6, r7, 60
	subfic	r5, r6, 16
	add	r7, r7, r5
	add	r4, r4, r5
	andi.	r5, r4, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2 and compares.
	   Loop until a mismatch or null occurs.  */
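	/* Both loops below are unrolled four times; GET16BYTES is needed
	   only on the s2 side, since s1 is quadword aligned here.  */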
L(s1_align):
	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	beq	cr6, L(s1_align)
	b	L(different)

	.align  4
L(aligned):
	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	beq	cr6, L(aligned)

	/* Calculate and return the difference.  */
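	/* v7 holds 0xff in each byte where the strings differ or where a
	   null byte terminated either string; VCTZLSBB yields the index of
	   the first such byte, and VEXTUBRX extracts the bytes at that
	   index from each source vector for the final subtraction.  */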
L(different):
	VCTZLSBB(r6, v7)
	VEXTUBRX(r5, r6, v4)
	VEXTUBRX(r4, r6, v5)
	subf	r3, r4, r5
	extsw	r3, r3
	blr

	.align  4
L(different_nocmpb):
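	/* r9 has 0xff in each byte where s1 held a null byte or where s1
	   and s2 differ.  neg/and isolate its lowest set bit, cntlzd/subfic
	   turn that into a bit index, and the shifts below bring the first
	   differing byte (the least significant one on little-endian) down
	   to the low byte of r3 and r10 for the final subtraction.  */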
	neg	r3, r9
	and	r9, r9, r3
	cntlzd	r9, r9
	subfic	r9, r9, 63
	srd	r3, r8, r9
	srd	r10, r10, r9
	rldicl	r10, r10, 0, 56
	rldicl	r3, r3, 0, 56
	subf	r3, r10, r3
	extsw	r3, r3
	blr

	.align	4
L(pagecross_check):
	subfic	r9, r9, 4096
	subfic	r7, r7, 4096
	cmpld	cr7, r7, r9
	bge	cr7, L(pagecross)
	mr	r7, r9

	/* If an unaligned 16-byte read would cross a 4K page boundary, use
	   a simple byte-by-byte comparison until the page boundary for s1
	   is reached.  */
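	/* Roughly equivalent to the following C loop (illustrative only),
	   where n is the byte count computed above:

	     while (n--)
	       {
	         unsigned char c1 = *s1++, c2 = *s2++;
	         if (c1 != c2)
	           return c1 - c2;
	         if (c1 == '\0')
	           return 0;
	       }  */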
L(pagecross):
	add	r7, r3, r7
	subf	r9, r3, r7
	mtctr	r9

	.align	4
L(pagecross_loop):
	/* Load a byte from s1 and s2, then check whether *s1 equals *s2
	   and whether *s1 is '\0'.  */
	lbz	r9, 0(r3)
	lbz	r10, 0(r4)
	addi	r3, r3, 1
	addi	r4, r4, 1
	cmplw	cr7, r9, r10
	cmpdi	cr5, r9, r0
	bne	cr7, L(pagecross_ne)
	beq	cr5, L(pagecross_nullfound)
	bdnz	L(pagecross_loop)
	b	L(align)

	.align	4
L(pagecross_ne):
	extsw	r3, r9
	mr	r9, r10
L(pagecross_retdiff):
	subf	r9, r9, r3
	extsw	r3, r9
	blr

	.align	4
L(pagecross_nullfound):
	li	r3, 0
	b	L(pagecross_retdiff)
END (STRCMP)
libc_hidden_builtin_def (strcmp)