/* Optimized memcmp implementation for PowerPC32.
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */
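
/* Reference semantics, as an illustrative C sketch only (not part of
   the build; names are arbitrary):

     int memcmp (const void *s1, const void *s2, size_t n)
     {
       const unsigned char *p1 = s1, *p2 = s2;
       while (n--)
         {
           if (*p1 != *p2)
             return *p1 - *p2;
           p1++;
           p2++;
         }
       return 0;
     }

   The code below computes the same result a word (4 bytes) at a time,
   loading each word in big-endian byte order (lwz on big-endian,
   lwbrx on little-endian) so that the byte at the lower address is the
   most significant and unsigned word compares order the same way the
   byte compares would.  */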

	.machine power4
EALIGN (memcmp, 4, 0)
	CALL_MCOUNT

#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	r0, rSTR2, rSTR1
	cmplwi	cr6, rN, 0
	cmplwi	cr1, rN, 12
	clrlwi.	r0, r0, 30
	clrlwi	r12, rSTR1, 30
	cmplwi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If the compare length is less than 12 bytes, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	stwu	1, -64(r1)
	cfi_adjust_cfa_offset(64)
	stw	rWORD8, 48(r1)
	stw	rWORD7, 44(r1)
	cfi_offset(rWORD8, (48-64))
	cfi_offset(rWORD7, (44-64))
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already word
   aligned and can perform the word aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet word aligned).  So we force the string addresses to the next lower
   word boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (word aligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first word.  This ensures that the loop count is
   correct and the first word (shifted) is in the expected register pair.  */
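/* Illustrative example (comment only): if both strings start 1 byte
   past a word boundary (r12 == 1), the addresses are rounded down to
   that boundary, rN is increased by 1, and the first pair of words is
   shifted left by r12 * 8 == 8 bits, roughly

     w1 = (word at (s1 & ~3), in big-endian byte order) << 8;
     w2 = (word at (s2 & ~3), in big-endian byte order) << 8;

   so the byte preceding the true start is discarded and both words are
   zero filled equally at the bottom before the unsigned compare.  */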
	.align	4
L(samealignment):
	clrrwi	rSTR1, rSTR1, 2
	clrrwi	rSTR2, rSTR2, 2
	beq	cr5, L(Waligned)
	add	rN, rN, r12
	slwi	rWORD6, r12, 3
	srwi	r0, rN, 4	/* Divide by 16 */
	andi.	r12, rN, 12	/* Get the word remainder */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 4 */
	.align	3
L(dsP1):
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD2, rWORD6
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 8 */
	.align	4
L(dPs2):
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD2, rWORD6
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 12 */
	.align	4
L(dPs3):
	slw	rWORD3, rWORD1, rWORD6
	slw	rWORD4, rWORD2, rWORD6
	cmplw	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	slw	rWORD1, rWORD1, rWORD6
	slw	rWORD2, rWORD2, rWORD6
	cmplw	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(Waligned):
	andi.	r12, rN, 12	/* Get the word remainder */
	srwi	r0, rN, 4	/* Divide by 16 */
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 4 */
	.align	4
L(dP1):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
#endif
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
L(dP1e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	bne	cr1, L(dLcr1)
	cmplw	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
	.align	3
L(dP1x):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 8 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP2):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
L(dP2e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	slwi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 12 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP3):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 0(rSTR1)
	lwz	rWORD4, 0(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
L(dP3e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	slwi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 16, remainder is 0 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
L(dP4e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	bne-	cr1, L(dLcr1)
	cmplw	cr7, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare.  Since
   we are aligned it is safe to load the whole word, and use
   shift right to eliminate bits beyond the compare length.  */
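/* Illustrative example (comment only): with a 3 byte remainder,
   r12 == 24 so the shift count computed above is 32 - 24 == 8 and the
   code below effectively returns

     (last_w1 >> 8) - (last_w2 >> 8)

   (names illustrative).  Only the three leading bytes survive the
   shift, and since at most 24 bits remain the word subtract cannot
   wrap, so its sign is the memcmp result.  */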
L(d00):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	sub	rRTN, rWORD1, rWORD2
	blr

	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr7):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr7x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr1):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr1x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr6):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr6x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr5):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr5x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands;
   branches based on the compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
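/* A rough picture of the schedule (comment only, assuming the grouping
   described above): each byte pair is loaded in one dispatch group,
   compared in a later group, and its conditional branch is issued
   later still, e.g.

     group k:  load pair k | branch on an older pair | compare pair k-1

   so the 2-3 cycle load/compare/branch dependency of a pair is hidden
   behind work on its neighbours instead of stalling the loop.  */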

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmplw	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmplw	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr7, L(bLcr7)

	cmplw	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmplw	cr7, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmplw	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the CTR) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop will exit early (before all the pending bytes are
   tested), and we must complete the pending operations before
   returning.  */
L(b1i):
	bne-	cr7, L(bLcr7)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr7, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is word aligned and we can
   perform the Wunaligned loop.

   Otherwise we know that rSTR1 is not word aligned, so we force the
   string addresses to the next lower word boundary and special case
   this first word using shift left to eliminate bits preceding the
   first byte.  Since we want to join the normal (Wunaligned) compare
   loop, starting at the second word, we need to adjust the length (rN)
   and special case the loop versioning for the first W.  This ensures
   that the loop count is correct and the first W (shifted) is in the
   expected register pair.  */
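/* Illustrative sketch (comment only) of how each rSTR1-aligned chunk
   of rSTR2 is rebuilt from two successive word loads, using the shift
   counts rSHL and rSHR (rSHL + rSHR == 32) defined and computed below:

     chunk = (prev_w2 << rSHL) | (curr_w2 >> rSHR);

   (names illustrative).  The "<< rSHL" part of each rSTR2 word is kept
   in one of the *_SHIFT registers for the next step, so the steady
   state loop needs only one new load from rSTR2 per chunk.  */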
#define rSHL		r29	/* Unaligned shift left count.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
	cfi_adjust_cfa_offset(64)
L(unaligned):
	stw	rSHL, 40(r1)
	cfi_offset(rSHL, (40-64))
	clrlwi	rSHL, rSTR2, 30
	stw	rSHR, 36(r1)
	cfi_offset(rSHR, (36-64))
	beq	cr5, L(Wunaligned)
	stw	rWORD8_SHIFT, 32(r1)
	cfi_offset(rWORD8_SHIFT, (32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 W.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the W before the W that contains
   the actual start of rSTR2.  */
	clrrwi	rSTR2, rSTR2, 2
	stw	rWORD2_SHIFT, 28(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (W aligned) start of rSTR1.  */
	clrlwi	rSHL, rWORD8_SHIFT, 30
	clrrwi	rSTR1, rSTR1, 2
	stw	rWORD4_SHIFT, 24(r1)
	slwi	rSHL, rSHL, 3
	cmplw	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	slwi	rWORD6, r12, 3
	stw	rWORD6_SHIFT, 20(r1)
	cfi_offset(rWORD2_SHIFT, (28-64))
	cfi_offset(rWORD4_SHIFT, (24-64))
	cfi_offset(rWORD6_SHIFT, (20-64))
	subfic	rSHR, rSHL, 32
	srwi	r0, rN, 4	/* Divide by 16 */
	andi.	r12, rN, 12	/* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a W where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD8, 0(rSTR2)
	addi	rSTR2, rSTR2, 4
#endif
	slw	rWORD8, rWORD8, rSHL

L(dus0):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	srw	r12, rWORD2, rSHR
	clrlwi	rN, rN, 30
	beq	L(duPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 4 */
	.align	4
L(dusP1):
	slw	rWORD8_SHIFT, rWORD2, rSHL
	slw	rWORD7, rWORD1, rWORD6
	slw	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmplw	cr5, rWORD7, rWORD8
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 8 */
	.align	4
L(duPs2):
	slw	rWORD6_SHIFT, rWORD2, rSHL
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 12 */
	.align	4
L(duPs3):
	slw	rWORD4_SHIFT, rWORD2, rSHL
	slw	rWORD3, rWORD1, rWORD6
	slw	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	slw	rWORD2_SHIFT, rWORD2, rSHL
	slw	rWORD1, rWORD1, rWORD6
	slw	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(Wunaligned):
	stw	rWORD8_SHIFT, 32(r1)
	clrrwi	rSTR2, rSTR2, 2
	stw	rWORD2_SHIFT, 28(r1)
	srwi	r0, rN, 4	/* Divide by 16 */
	stw	rWORD4_SHIFT, 24(r1)
	andi.	r12, rN, 12	/* Get the W remainder */
	stw	rWORD6_SHIFT, 20(r1)
	cfi_offset(rWORD8_SHIFT, (32-64))
	cfi_offset(rWORD2_SHIFT, (28-64))
	cfi_offset(rWORD4_SHIFT, (24-64))
	cfi_offset(rWORD6_SHIFT, (20-64))
	slwi	rSHL, rSHL, 3
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR2, rSTR2, 4
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD6, 0(rSTR2)
	lwzu	rWORD8, 4(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	subfic	rSHR, rSHL, 32
	slw	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 4 */
	.align	4
L(duP1):
	srw	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD7, 0(rSTR1)
#endif
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmplw	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmplw	cr5, rWORD7, rWORD8
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 8(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 8 */
	.align	4
L(duP2):
	srw	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD5, 0(rSTR1)
#endif
	or	rWORD6, r0, rWORD6_SHIFT
	slw	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	cmplw	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmplw	cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr6, L(duLcr6)
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 12 */
	.align	4
L(duP3):
	srw	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD3, 0(rSTR1)
#endif
	slw	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	cmplw	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	srw	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD1, 0(rSTR1)
#endif
	slw	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmplw	cr5, rWORD7, rWORD8
	bdz-	L(du24)		/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz+	L(duLoop)

L(duL4):
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare.  We use
   shift right to eliminate bits beyond the compare length.
   This allows the use of word subtract to compute the final result.

   However it may not be safe to load rWORD2 which may be beyond the
   string length. So we compare the bit length of the remainder to
   the right shift count (rSHR). If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
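/* Illustrative example (comment only): if rSHR == 24, a remainder of
   1 to 3 bytes (rN == 8, 16 or 24 bits here) satisfies rN <= rSHR, so
   the needed rSTR2 bytes are already in rWORD8_SHIFT and rWORD2 is
   not loaded; if rSHR == 8, only a 1 byte remainder avoids the extra
   load.  */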
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	.align	4
L(dutrim):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
#else
	lwz	rWORD1, 4(rSTR1)
#endif
	lwz	rWORD8, 48(r1)
	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	lwz	rWORD7, 44(r1)
	lwz	rSHL, 40(r1)
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	lwz	rSHR, 36(r1)
	lwz	rWORD8_SHIFT, 32(r1)
	sub	rRTN, rWORD1, rWORD2
	b	L(dureturn26)
	.align	4
L(duLcr7):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
L(dureturn29):
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
L(dureturn27):
	lwz	rWORD8_SHIFT, 32(r1)
L(dureturn26):
	lwz	rWORD2_SHIFT, 28(r1)
L(dureturn25):
	lwz	rWORD4_SHIFT, 24(r1)
	lwz	rWORD6_SHIFT, 20(r1)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	blr
END (memcmp)

libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)
