/* Optimized memcmp implementation for PowerPC64.
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */
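
/* For reference, the contract implemented below is the standard memcmp:
   compare SIZE bytes as unsigned chars and return a negative, zero, or
   positive value according to the first difference.  A minimal C sketch
   of that contract only (hypothetical helper name, not the optimized
   code in this file):

     int simple_memcmp (const char *s1, const char *s2, size_t size)
     {
       const unsigned char *p1 = (const unsigned char *) s1;
       const unsigned char *p2 = (const unsigned char *) s2;
       for (size_t i = 0; i < size; i++)
	 if (p1[i] != p2[i])
	   return p1[i] < p2[i] ? -1 : 1;
       return 0;
     }
*/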

#ifndef MEMCMP
# define MEMCMP memcmp
#endif

#ifndef __LITTLE_ENDIAN__
	.machine power4
#else
/* Little endian is only available since POWER8, so it's safe to
   specify .machine as power8 (or older), even though this is a POWER4
   file.  Since the little-endian code uses 'ldbrx', power7 is enough. */
	.machine power7
#endif
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	r0, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	r0, r0, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	std	rWORD8, -8(r1)
	std	rWORD7, -16(r1)
	cfi_offset(rWORD8, -8)
	cfi_offset(rWORD7, -16)
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW. This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
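/* Illustrative only: a rough C sketch of the first-DW handling described
   above, assuming big-endian double words (the little-endian path below
   uses ldbrx, so the loaded words look the same).  Names are hypothetical.

     size_t off = (uintptr_t) s1 & 7;                  // r12: low 3 bits
     const uint64_t *p1 = (const uint64_t *) ((uintptr_t) s1 & ~(uintptr_t) 7);
     const uint64_t *p2 = (const uint64_t *) ((uintptr_t) s2 & ~(uintptr_t) 7);
     n += off;                                         // count from the DW start
     uint64_t w1 = p1[0] << (off * 8);                 // drop the OFF leading bytes
     uint64_t w2 = p2[0] << (off * 8);
     // w1/w2 now compare exactly like an ordinary first double word.
*/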
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder */
	srdi	r0, rN, 5	/* Divide by 32 */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
#endif
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 0(rSTR1)
	ld	rWORD4, 0(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	bne-	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
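/* Illustrative only: the shift trick above in rough C, assuming the last
   double words of both strings have already been loaded into w1/w2
   (byte-reversed on little-endian so the first byte is most significant)
   and 'remaining' holds the 1-7 leftover bytes.  Names are hypothetical.

     unsigned int shift = 64 - 8 * remaining;   // as in subfic rN, r12, 64
     w1 >>= shift;                              // keep only the wanted bytes
     w2 >>= shift;
     return w1 == w2 ? 0 : (w1 < w2 ? -1 : 1);
*/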
L(d00):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	li	rRTN, 0
	blr

	.align	4
L(dLcr7):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr7x):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr1x):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr6x):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr5x):
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
#if 0
/* Huh?  We've already branched on cr6!  */
	beq-	cr6, L(zeroLength)
#endif

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
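/* Illustrative only: the software pipelining above in rough C, assuming
   n >= 1 (the zero-length case returns earlier).  Each iteration loads
   the next pair of bytes but branches on the compare of the pair loaded
   earlier, hiding the load-to-branch latency; the real loop below is
   additionally unrolled three ways.  Names are hypothetical.

     unsigned char a = *s1, b = *s2;             // prime the pipeline
     while (--n)
       {
	 unsigned char na = *++s1, nb = *++s2;    // load the next pair
	 if (a != b)                              // test the previous pair
	   return a < b ? -1 : 1;
	 a = na;
	 b = nb;
       }
     return a - b;                                // drain the last pair
*/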

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop exits early (before all pending bytes are tested),
   and we must complete the pending compares before returning.  */
L(b1i):
	bne-	cr7, L(bLcr7)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr7, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is double word
   aligned and we can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not yet DW aligned.
   So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW. This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
#define rSHL		r29	/* Unaligned shift left count.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
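/* Illustrative only: the shift-and-or merge used below for the unaligned
   rSTR2 stream, in rough C with hypothetical names.  rSHL is 8 times
   rSTR2's byte offset within its double word and rSHR is 64 - rSHL;
   big-endian double words are assumed (ldbrx gives the same view on
   little-endian).  Each aligned load finishes one merged word with its
   high bytes and starts the next with its low bytes.

     uint64_t carry = first_dw << shl;           // kept in rWORD8_SHIFT etc.
     for (size_t i = 0; i < ndw; i++)
       {
	 uint64_t w = p2_aligned[i + 1];           // aligned load from rSTR2
	 uint64_t merged = carry | (w >> shr);     // lines up with rSTR1's DW i
	 carry = w << shl;                         // low bytes for the next merge
	 // merged is then compared against the aligned rSTR1 double word.
       }
*/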
L(unaligned):
	std	rSHL, -24(r1)
	cfi_offset(rSHL, -24)
	clrldi	rSHL, rSTR2, 61
	beq-	cr6, L(duzeroLength)
	std	rSHR, -32(r1)
	cfi_offset(rSHR, -32)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, -40(r1)
	cfi_offset(rWORD8_SHIFT, -40)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before the one that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, -48(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, -56(r1)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, -64(r1)
	cfi_offset(rWORD2_SHIFT, -48)
	cfi_offset(rWORD4_SHIFT, -56)
	cfi_offset(rWORD6_SHIFT, -64)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD8, 0(rSTR2)
	addi	rSTR2, rSTR2, 8
#endif
	sld	rWORD8, rWORD8, rSHL

L(dus0):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, -40(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, -48(r1)
	srdi	r0, rN, 5	/* Divide by 32 */
	std	rWORD4_SHIFT, -56(r1)
	andi.	r12, rN, 24	/* Get the DW remainder */
	std	rWORD6_SHIFT, -64(r1)
	cfi_offset(rWORD8_SHIFT, -40)
	cfi_offset(rWORD2_SHIFT, -48)
	cfi_offset(rWORD4_SHIFT, -56)
	cfi_offset(rWORD6_SHIFT, -64)
	sldi	rSHL, rSHL, 3
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD6, 0(rSTR2)
	ldu	rWORD8, 8(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD7, 0(rSTR1)
#endif
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD5, 0(rSTR1)
#endif
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD3, 0(rSTR1)
#endif
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	srd	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD1, 0(rSTR1)
#endif
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz-	L(du24)		/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz+	L(duLoop)

L(duL4):
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length. So we compare the bit length of the remainder to
   the right shift count (rSHR). If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
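/* Illustrative only: that guard in rough C with hypothetical names.
   'carry' is the value saved in rWORD8_SHIFT, 'remaining_bits' is the
   leftover byte count times 8 (rN here), and 'shr' is rSHR.

     uint64_t tail2 = 0;
     if (remaining_bits > shr)
       tail2 = *p2_aligned >> shr;    // safe: this DW overlaps rSTR2
     tail2 |= carry;                  // all wanted rSTR2 bits are now present
*/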
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
#else
	ld	rWORD1, 8(rSTR1)
#endif
	ld	rWORD8, -8(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, -16(r1)
	ld	rSHL, -24(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, -32(r1)
	ld	rWORD8_SHIFT, -40(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, -48(r1)
	ld	rWORD4_SHIFT, -56(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, -64(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(duLcr7):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dureturn29):
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, -40(r1)
L(dureturn26):
	ld	rWORD2_SHIFT, -48(r1)
L(dureturn25):
	ld	rWORD4_SHIFT, -56(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, -64(r1)
	blr
L(duzeroLength):
	li	rRTN, 0
	blr

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)
