1/* Optimized strcmp implementation for PowerPC64/POWER8.
2   Copyright (C) 2015-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
/* STRCMP names the entry point defined below.  It may already be defined
   by whatever builds or includes this file; otherwise it is plain
   strcmp.  */
#if !defined STRCMP
# define STRCMP strcmp
#endif
24
25/* Implements the function
26
   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
28
   The implementation uses unaligned doubleword accesses to avoid specialized
   code paths depending on data alignment.  Although recent powerpc64 uses
   64K pages by default, the page-cross handling assumes a minimum page size
   of 4K.  */
33
34	.machine power8
ENTRY_TOCLESS (STRCMP, 4)
	/* Register roles:

	     r0       constant 0; the cmpb operand used to locate NUL bytes
	     r3, r4   s1 and s2 on entry; r3 carries the result on return
	     r7, r4   s1 and s2 cursors once the leading bytes are done
	     r8, r10  doubleword loaded from s1 and from s2
	     r12      cmpb (r8, 0): 0xff in every byte where s1 is NUL
	     r11      cmpb (r8, r10): 0xff in every byte where s1 == s2
	     r9       r12 | ~r11: 0xff in every byte where the scan has to
		      stop (NUL in s1, or s1 != s2); also used as scratch.

	   The result is the difference between the first pair of bytes
	   at which the scan stops, each taken as an unsigned byte.  */
	li	r0,0

	/* Take the byte-wise path if a 16-byte read starting at s1 or at
	   s2 would cross a 4K page boundary, i.e. if for either pointer:

	    (((size_t) s) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  r7 and r9
	   hold the page offsets of s1 and s2 and are reused by
	   L(pagecross_check).  */

	rldicl	r7,r3,0,52
	rldicl	r9,r4,0,52
	cmpldi	cr7,r7,4096-16
	bgt	cr7,L(pagecross_check)
	cmpldi	cr5,r9,4096-16
	bgt	cr5,L(pagecross_check)

	/* For short strings of up to 16 bytes, load both s1 and s2 using
	   unaligned doublewords and compare.  */
	ld	r8,0(r3)
	ld	r10,0(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	ld	r8,8(r3)
	ld	r10,8(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	/* The first 16 bytes are equal and contain no NUL.  */
	addi	r7,r3,16
	addi	r4,r4,16

L(align_8b):
	/* The leading bytes have been checked.  Round the source1 cursor
	   down to a doubleword boundary and move source2 back by the same
	   amount, so the two cursors still address matching bytes.  The
	   bytes that get compared again are already known to be equal and
	   not NUL.  */
	rldicl	r9,r7,0,61	/* source1 alignment to doubleword  */
	subf	r4,r9,r4	/* Adjust source2 address based on source1
				   alignment.  */
	rldicr	r7,r7,0,60	/* Align source1 to doubleword.  */

	/* At this point, source1 alignment is 0 and source2 alignment is
	   between 0 and 7.  Check if source2 alignment is 0, meaning both
	   sources have the same alignment.  */
	andi.	r9,r4,0x7
	bne	cr0,L(loop_diff_align)

	/* If both source1 and source2 are doubleword aligned, there is no
	   need for page boundary cross checks.  */

	ld	r8,0(r7)
	ld	r10,0(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	/* Three doublewords per iteration at offsets 8, 16 and 24; the
	   final ldu pair advances both cursors by 24.  */
	.align 4
L(loop_equal_align):
	ld	r8,8(r7)
	ld	r10,8(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	ld	r8,16(r7)
	ld	r10,16(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	ldu	r8,24(r7)
	ldu	r10,24(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	b	L(loop_equal_align)

	/* The scan stopped inside the current doubleword pair: r8 holds
	   the s1 doubleword, r10 the s2 doubleword, and r9 has 0xff in
	   every byte where s1 is NUL or where s1 and s2 differ.  Turn r9
	   into the right-shift count that brings the first such byte (in
	   memory order) down to the low 8 bits:

	   #if __LITTLE_ENDIAN__
	     r9 = 63 - __builtin_clzl (r9 & -r9);   lowest set bit index
	   #else
	     r9 = 56 - __builtin_clzl (r9);
	   #endif
	     r3 = (int) (((r8 >> r9) & 0xff) - ((r10 >> r9) & 0xff));

	   When the stop byte is a NUL present in both strings the two
	   bytes are equal and the result is 0.  */
L(different_nocmpb):
#ifdef __LITTLE_ENDIAN__
	neg	r3,r9
	and	r9,r9,r3
	cntlzd	r9,r9
	subfic	r9,r9,63
#else
	cntlzd	r9,r9
	subfic	r9,r9,56
#endif
	srd	r3,r8,r9
	srd	r10,r10,r9
	rldicl	r10,r10,0,56	/* Keep only the s2 byte.  */
	rldicl	r3,r3,0,56	/* Keep only the s1 byte.  */
	subf	r3,r10,r3	/* r3 = s1 byte - s2 byte.  */
	extsw	r3,r3
	blr

	/* On entry r7 and r9 are the offsets of s1 and s2 inside their 4K
	   pages.  Convert both to the distance up to the next page
	   boundary and leave the larger of the two in r7.  */
	.align	4
L(pagecross_check):
	subfic	r9,r9,4096
	subfic	r7,r7,4096
	cmpld	cr7,r7,r9
	bge	cr7,L(pagecross)
	mr	r7,r9

	/* If an unaligned 16-byte read would cross a 4K page boundary, use
	   a simple byte-by-byte comparison for r7 bytes, the larger of
	   the two distances to a page boundary computed above.
	   NOTE(review): this can mean up to 4096 byte-wise iterations;
	   whether the smaller distance would suffice has not been checked
	   here.  */
L(pagecross):
	add	r7,r3,r7	/* r7 = s1 cursor for L(align_8b).  */
	subf	r9,r3,r7	/* r9 = number of bytes to compare.  */
	mtctr	r9

	.align	4
L(pagecross_loop):
	/* Load a byte from s1 and s2, check whether *s1 is equal to *s2
	   and whether *s1 is '\0'.  */
	lbz	r9,0(r3)
	lbz	r10,0(r4)
	addi	r3,r3,1
	addi	r4,r4,1
	cmplw	cr7,r9,r10
	cmpdi	cr5,r9,0
	bne	cr7,L(pagecross_ne)
	beq	cr5,L(pagecross_nullfound)
	bdnz	L(pagecross_loop)
	b	L(align_8b)

	.align	4
	/* The unaligned read of source2 would cross a 4K page boundary,
	   and the differing byte or the NUL may be in the remaining page
	   bytes.  Since the unaligned load cannot be used, compare exactly
	   8 bytes one at a time so that source1 stays doubleword aligned,
	   then fall through to L(loop_unaligned).  */
L(check_source2_byte):
	li	r9,8
	mtctr	r9

	.align	4
L(check_source2_byte_loop):
	lbz	r9,0(r7)
	lbz	r10,0(r4)
	addi	r7,r7,1
	addi	r4,r4,1
	cmplw	cr7,r9,r10
	cmpdi	cr5,r9,0
	bne	cr7,L(pagecross_ne)
	beq	cr5,L(pagecross_nullfound)
	bdnz	L(check_source2_byte_loop)

	/* If source2 is not doubleword aligned, the code needs to check
	   on each iteration whether the unaligned doubleword access would
	   cross a 4K page boundary.  The loop is entered at
	   L(loop_diff_align), so the check runs before every load.  */
	.align	5
L(loop_unaligned):
	ld	r8,0(r7)
	ld	r10,0(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)
	addi	r7,r7,8
	addi	r4,r4,8

L(loop_diff_align):
	/* Check if [src2]+8 crosses a 4K page boundary:

	     srcin2 % PAGE_SIZE > (PAGE_SIZE - 8)

	     with PAGE_SIZE being 4096.  */
	rldicl	r9,r4,0,52
	cmpldi	cr7,r9,4088
	ble	cr7,L(loop_unaligned)
	b	L(check_source2_byte)

	/* Byte-wise exits.  r9 is the byte from s1 and r10 the byte from
	   s2, both zero-extended by lbz.  */
	.align	4
L(pagecross_ne):
	extsw	r3,r9
	mr	r9,r10
L(pagecross_retdiff):
	subf	r9,r9,r3	/* r9 = r3 - r9.  */
	extsw	r3,r9
	blr

	/* Reached only when the bytes are equal and the s1 byte in r9 is
	   0, so the subtraction at L(pagecross_retdiff) yields 0.  */
	.align	4
L(pagecross_nullfound):
	li	r3,0
	b	L(pagecross_retdiff)
END (STRCMP)
libc_hidden_builtin_def (strcmp)
249