1/* Optimized strcmp implementation for Power7 using 'cmpb' instruction
2   Copyright (C) 2014-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
/* The optimization is achieved here through the cmpb instruction.
   8-byte aligned strings are processed with doubleword comparisons,
   and unaligned strings are handled with a loop-unrolled
   byte-by-byte comparison.  */
23
24#include <sysdep.h>
25
/* The entry-point symbol defaults to strcmp unless STRCMP was already
   defined before this point.  */
#ifndef STRCMP
# define STRCMP strcmp
#endif

/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   Returns 0 in r3 when the strings are equal; otherwise returns the
   difference (s1 byte minus s2 byte, both taken as unsigned bytes) of
   the first pair of bytes that differ.

   Two strategies are used, selected once on entry:
   - both pointers 8-byte aligned: compare one doubleword per step,
     four steps per loop iteration, using cmpb to find NUL bytes;
   - otherwise: compare one byte per step, four steps per loop
     iteration.  This path never switches to the doubleword loop.

   Registers written by the code below: r3, r4 (string cursors, r3 also
   the result), r5 (constant zero for cmpb), r7-r10 (scratch), and
   condition fields cr0, cr6, cr7.  */

	.machine	power7
ENTRY_TOCLESS (STRCMP, 4)
	CALL_MCOUNT 2

	or	r9, r3, r4		/* merge the address bits of s1 and s2  */
	rldicl.	r10, r9, 0, 61		/* low 3 bits: nonzero if either is unaligned  */
	bne	cr0, L(process_unaligned_bytes)
	li	r5, 0			/* all-zero doubleword for the cmpb NUL test  */

	.align 4
/* Both strings are doubleword aligned: compare 8 bytes per step.
   In every step r8 = 8 bytes of s1, r10 = 8 bytes of s2, and r7/r9 =
   cmpb mask with 0xff in each byte position where r8 holds NUL.  */
L(unrollDword):
	ld	r8,0(r3)
	ld	r10,0(r4)
	cmpb	r7,r8,r5		/* mark the NUL bytes of r8  */
	cmpdi	cr7,r7,0
	mr	r9,r7			/* L(null_found) takes the mask in r9  */
	bne	cr7,L(null_found)	/* s1 ends inside this doubleword  */
	cmpld	cr7,r8,r10
	bne	cr7,L(different)

	ld	r8,8(r3)
	ld	r10,8(r4)
	cmpb	r7,r8,r5
	cmpdi	cr7,r7,0
	mr	r9,r7
	bne	cr7,L(null_found)
	cmpld	cr7,r8,r10
	bne	cr7,L(different)

	ld	r8,16(r3)
	ld	r10,16(r4)
	cmpb	r7,r8,r5
	cmpdi	cr7,r7,0
	mr	r9,r7
	bne	cr7,L(null_found)
	cmpld	cr7,r8,r10
	bne	cr7,L(different)

	ld	r8,24(r3)
	ld	r10,24(r4)
	cmpb	r7,r8,r5
	cmpdi	cr7,r7,0
	mr	r9,r7
	bne	cr7,L(null_found)
	cmpld	cr7,r8,r10
	bne	cr7,L(different)

	addi	r3, r3, 32		/* advance s1 by the unroll factor  */
	addi	r4, r4, 32		/* advance s2 by the unroll factor  */
	/* Reaching this point means the last cmpld compared equal and
	   addi does not alter CR, so the loop always continues.  Branch
	   unconditionally instead of using an always-taken beq; control
	   never falls through into L(null_found).  */
	b	L(unrollDword)

	.align 4
/* r8 contains a NUL byte and r9 is the cmpb mask of the NUL bytes in
   r8.  Force every byte that follows the first NUL (in string order)
   to 0xff in both r8 and r10, so that whatever lies past the end of s1
   compares equal in L(different).  */
L(null_found):
#ifdef __LITTLE_ENDIAN__
	neg	r7,r9
	and	r9,r9,r7		/* isolate the lowest set bit of the mask  */
	li	r7,-1
	cntlzd	r9,r9
	subfic	r9,r9,71		/* bit index where the byte after the NUL starts  */
	sld	r9,r7,r9		/* ones in every byte above the first NUL  */
#else
	cntlzd	r9,r9			/* bit offset of the first NUL from the top  */
	li	r7,-1
	addi	r9,r9,8			/* step over the NUL byte itself  */
	srd	r9,r7,r9		/* ones in every byte below the first NUL  */
#endif
	or	r8,r8,r9
	or	r10,r10,r9

/* r8 and r10 either differ, or have had their tails masked by
   L(null_found).  Find the first byte (in string order) where they
   disagree and return the difference of that byte pair.  If every byte
   agrees, the shift count computed below is 64 or more in its low
   seven bits, srd then produces 0 for both operands, and the result
   is 0.  */
L(different):
	cmpb	r9,r8,r10		/* 0xff in each byte where r8 and r10 agree  */
#ifdef __LITTLE_ENDIAN__
	addi	r7,r9,1
	andc	r9,r7,r9		/* isolate the lowest clear bit of the mask  */
	cntlzd	r9,r9
	subfic	r9,r9,63		/* bit index of the first differing byte  */
#else
	not	r9,r9
	cntlzd	r9,r9			/* number of leading matching bits  */
	subfic	r9,r9,56		/* shift that moves the differing byte to the bottom  */
#endif
	srd	r3,r8,r9
	srd	r10,r10,r9
	rldicl	r10,r10,0,56		/* keep only the low byte of each  */
	rldicl	r3,r3,0,56
	subf	r3,r10,r3		/* r3 = s1 byte - s2 byte  */
	blr

	.align 4
/* Entered when at least one pointer is not doubleword aligned.
   Compare byte by byte, four bytes per iteration.  In every step
   r9 = byte of s1 and r10 = byte of s2, zero-extended by lbz.  */
L(process_unaligned_bytes):
	lbz	r9, 0(r3)		/* load byte from s1  */
	lbz	r10, 0(r4)		/* load byte from s2  */
	cmpdi	cr7, r9, 0		/* is *s1 the terminating NUL?  */
	beq	cr7, L(diffOfNULL)	/* yes: return 0 - *s2  */
	cmplw	cr7, r9, r10		/* compare *s1 and *s2  */
	bne	cr7, L(ComputeDiff)	/* differ: return *s1 - *s2  */

	lbz	r9, 1(r3)		/* second byte  */
	lbz	r10, 1(r4)
	cmpdi	cr7, r9, 0
	beq	cr7, L(diffOfNULL)
	cmplw	cr7, r9, r10
	bne	cr7, L(ComputeDiff)

	lbz	r9, 2(r3)		/* third byte  */
	lbz	r10, 2(r4)
	cmpdi	cr7, r9, 0
	beq	cr7, L(diffOfNULL)
	cmplw	cr7, r9, r10
	bne	cr7, L(ComputeDiff)

	lbz	r9, 3(r3)		/* fourth byte  */
	lbz	r10, 3(r4)
	addi	r3, r3, 4		/* advance s1 by the unroll factor  */
	cmpdi	cr7, r9, 0
	cmplw	cr6, r9, r10		/* equality kept in cr6 for the loop branch  */
	beq	cr7, L(diffOfNULL)
	addi	r4, r4, 4		/* advance s2 by the unroll factor  */
	beq	cr6, L(process_unaligned_bytes)	/* equal: next four bytes  */
	/* Bytes differ: fall through to L(ComputeDiff).  */

	.align 4
/* r9 = byte of s1, r10 = differing byte of s2.  */
L(ComputeDiff):
	extsw	r9, r9
	subf	r10, r10, r9		/* r10 = *s1 - *s2  */
	extsw	r3, r10			/* sign-extend the 32-bit result into r3  */
	blr

	.align 4
/* s1 ended; r10 = the corresponding byte of s2.  Result is 0 - r10,
   which is 0 when s2 ended at the same position.  */
L(diffOfNULL):
	li	r9, 0
	subf	r10, r10, r9		/* r10 = 0 - *s2  */
	extsw	r3, r10			/* sign-extend the 32-bit result into r3  */
	blr

END (STRCMP)
libc_hidden_builtin_def (strcmp)
169