/* Optimized strcmp implementation for PowerPC64/POWER8.
   Copyright (C) 2015-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef STRCMP
# define STRCMP strcmp
#endif

/* Implements the function

   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   The implementation uses unaligned doubleword access to avoid specialized
   code paths depending on data alignment.  Although recent powerpc64 uses
   64K as default, the page cross handling assumes minimum page size of
   4k.

   Register usage inside the routine:

     r0        constant 0, the NUL comparand for cmpb.
     r3        s1 on entry (advanced by the first byte loop); result on
	       return.
     r4        s2 cursor.
     r7        page offset of s1 on entry, then the s1 cursor once the
	       first 16 bytes (or the first byte loop) have been handled.
     r8, r10   current doubleword of s1 and s2.
     r9        scratch: page offset of s2, loop count, combined mask, or
	       the current s1 byte in the byte loops (r10 then holds the
	       s2 byte).
     r11, r12  cmpb results (bytes equal / s1 byte is NUL).
     cr0, cr5, cr7 and ctr are used as well.  */

	.machine power8
ENTRY_TOCLESS (STRCMP, 4)
	li	r0,0

	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */

	rldicl	r7,r3,0,52	/* r7 = s1 % 4096  */
	rldicl	r9,r4,0,52	/* r9 = s2 % 4096  */
	cmpldi	cr7,r7,4096-16
	bgt	cr7,L(pagecross_check)
	cmpldi	cr5,r9,4096-16
	bgt	cr5,L(pagecross_check)

	/* For short string up to 16 bytes, load both s1 and s2 using
	   unaligned dwords and compare.  After each pair of loads:
	     r12 = 0xff in every byte where the s1 byte is NUL,
	     r11 = 0xff in every byte where the s1 and s2 bytes are equal,
	     r9  = r12 | ~r11, i.e. non-zero when a NUL or a mismatch
		   is present in this doubleword.  */
	ld	r8,0(r3)
	ld	r10,0(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	ld	r8,8(r3)
	ld	r10,8(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	addi	r7,r3,16
	addi	r4,r4,16

L(align_8b):
	/* Now it has checked for first 16 bytes, align source1 to doubleword
	   and adjust source2 address.  Both cursors move back by the same
	   amount, so they keep pointing at corresponding bytes.  */
	rldicl	r9,r7,0,61	/* source1 alignment to doubleword  */
	subf	r4,r9,r4	/* Adjust source2 address based on source1
				   alignment.  */
	rldicr	r7,r7,0,60	/* Align source1 to doubleword.  */

	/* At this point, source1 alignment is 0 and source2 alignment is
	   between 0 and 7.  Check if source2 alignment is 0, meaning both
	   sources have the same alignment.  */
	andi.	r9,r4,0x7
	bne	cr0,L(loop_diff_align)

	/* If both source1 and source2 are doubleword aligned, there is no
	   need for page boundary cross checks.  */

	ld	r8,0(r7)
	ld	r10,0(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	/* Main loop for equally aligned sources, unrolled three times.  The
	   last pair of loads uses the update form, so each trip advances
	   both cursors by 24 bytes.  */
	.align 4
L(loop_equal_align):
	ld	r8,8(r7)
	ld	r10,8(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	ld	r8,16(r7)
	ld	r10,16(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	ldu	r8,24(r7)
	ldu	r10,24(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	b	L(loop_equal_align)

	/* A NUL or a mismatching byte was found.  On entry at
	   L(different_nocmpb), r8 holds the s1 doubleword, r10 the s2
	   doubleword and r9 the combined mask (0xff in every byte where s1
	   has a NUL or where s1 and s2 differ).  Compute the bit offset of
	   the first such byte in memory order, shift that byte of both
	   doublewords down to the low 8 bits and return the difference:

	   #if __LITTLE_ENDIAN__
	     shift = number of trailing zero bits in r9
	   #else
	     shift = 56 - number of leading zero bits in r9
	   #endif
	     r3 = (int) (((r8 >> shift) & 0xff) - ((r10 >> shift) & 0xff))

	   The instruction placed just before L(different_nocmpb) in each
	   variant follows an unconditional branch and no branch in this
	   file targets it; every user enters at the label.  */

#ifdef __LITTLE_ENDIAN__
	nor	r9,r9,r9
L(different_nocmpb):
	neg	r3,r9
	and	r9,r9,r3	/* Isolate the lowest set bit of the mask.  */
	cntlzd	r9,r9
	subfic	r9,r9,63	/* r9 = bit index of that lowest set bit.  */
#else
	not	r9,r9
L(different_nocmpb):
	cntlzd	r9,r9
	subfic	r9,r9,56	/* r9 = 56 - leading zero count.  */
#endif
	srd	r3,r8,r9
	srd	r10,r10,r9
	rldicl	r10,r10,0,56	/* Keep only the low byte of s2's dword.  */
	rldicl	r3,r3,0,56	/* Keep only the low byte of s1's dword.  */
	subf	r3,r10,r3	/* r3 = s1 byte - s2 byte.  */
	extsw	r3,r3
	blr

	/* Reached when s1 or s2 lies within the last 15 bytes of a 4K page.
	   r7 and r9 hold the page offsets of s1 and s2; turn each into
	   4096 - offset and leave the larger of the two (unsigned) in
	   r7.  */
	.align	4
L(pagecross_check):
	subfic	r9,r9,4096
	subfic	r7,r7,4096
	cmpld	cr7,r7,r9
	bge	cr7,L(pagecross)
	mr	r7,r9

	/* If unaligned 16 bytes reads across a 4K page boundary, it uses
	   a simple byte by byte comparison.  The count selected above is
	   loaded into ctr, and r7 is set to s1 plus that count, which is
	   where r3 ends up if the loop runs to completion.  */
L(pagecross):
	add	r7,r3,r7
	subf	r9,r3,r7
	mtctr	r9

	.align	4
L(pagecross_loop):
	/* Loads a byte from s1 and s2, compare if *s1 is equal to *s2
	   and if *s1 is '\0'.  */
	lbz	r9,0(r3)
	lbz	r10,0(r4)
	addi	r3,r3,1
	addi	r4,r4,1
	cmplw	cr7,r9,r10
	cmpdi	cr5,r9,0
	bne	cr7,L(pagecross_ne)
	beq	cr5,L(pagecross_nullfound)
	bdnz	L(pagecross_loop)
	b	L(align_8b)

	.align	4
	/* The unaligned read of source2 will cross a 4K page boundary,
	   and the different byte or NULL may be in the remaining page
	   bytes.  Since it can not use the unaligned load, the algorithm
	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
L(check_source2_byte):
	li	r9,8
	mtctr	r9

	.align	4
L(check_source2_byte_loop):
	lbz	r9,0(r7)
	lbz	r10,0(r4)
	addi	r7,r7,1
	addi	r4,r4,1
	cmplw	cr7,r9,r10
	cmpdi	cr5,r9,0
	bne	cr7,L(pagecross_ne)
	beq	cr5,L(pagecross_nullfound)
	bdnz	L(check_source2_byte_loop)

	/* If source2 is unaligned to doubleword, the code needs to check
	   on each iteration if the unaligned doubleword access will cross
	   a 4k page boundary.  */
	.align	5
L(loop_unaligned):
	ld	r8,0(r7)
	ld	r10,0(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)
	addi	r7,r7,8
	addi	r4,r4,8

L(loop_diff_align):
	/* Check if [src2]+8 cross a 4k page boundary:

	     srcin2 % PAGE_SIZE > (PAGE_SIZE - 8)

	   with PAGE_SIZE being 4096.  */
	rldicl	r9,r4,0,52
	cmpldi	cr7,r9,4088
	ble	cr7,L(loop_unaligned)
	b	L(check_source2_byte)

	/* Byte loops found a mismatch: r9 is the s1 byte and r10 the s2
	   byte.  Return r9 - r10.  */
	.align	4
L(pagecross_ne):
	extsw	r3,r9
	mr	r9,r10
L(pagecross_retdiff):
	subf	r9,r9,r3	/* r9 = r3 - r9  */
	extsw	r3,r9
	blr

	/* Byte loops found equal bytes that are both NUL (r9 is 0 here), so
	   the subtraction at L(pagecross_retdiff) yields 0.  */
	.align	4
L(pagecross_nullfound):
	li	r3,0
	b	L(pagecross_retdiff)
END (STRCMP)
libc_hidden_builtin_def (strcmp)