/* Optimized strcmp implementation for PowerPC64/POWER9.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

#ifndef STRCMP
# define STRCMP strcmp
#endif

/* Implements the function

   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   The implementation uses unaligned doubleword accesses for the first
   32 bytes, as in the POWER8 implementation, and vectorised loops after
   that.  */

/* TODO: Change these to actual instructions once the minimum binutils is
   upgraded to 2.27.  Macros are defined below for these newer instructions
   in order to maintain compatibility.  */
#define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))

#define VEXTUBRX(t,a,b) .long (0x1000070d \
			| ((t)<<(32-11))  \
			| ((a)<<(32-16))  \
			| ((b)<<(32-21)) )

#define VCMPNEZB(t,a,b) .long (0x10000507 \
			| ((t)<<(32-11))  \
			| ((a)<<(32-16))  \
			| ((b)<<(32-21)) )

/* Get 16 bytes for the unaligned case.
   reg1: Vector to hold next 16 bytes.
   reg2: Address to read from.
   reg3: Permute control vector.  */
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vperm	v8, v2, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3;

/* TODO: change this to .machine power9 when the minimum required binutils
   allows it.  */

	.machine  power7
ENTRY_TOCLESS (STRCMP, 4)
	li	r0, 0

	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */

	rldicl	r7, r3, 0, 52
	rldicl	r9, r4, 0, 52
	cmpldi	cr7, r7, 4096-16
	bgt	cr7, L(pagecross_check)
	cmpldi	cr5, r9, 4096-16
	bgt	cr5, L(pagecross_check)

	/* For short strings up to 16 bytes, load both s1 and s2 using
	   unaligned dwords and compare.  */
	ld	r8, 0(r3)
	ld	r10, 0(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	ld	r8, 8(r3)
	ld	r10, 8(r4)
	cmpb	r12, r8, r0
	cmpb	r11, r8, r10
	orc.	r9, r12, r11
	bne	cr0, L(different_nocmpb)

	addi	r7, r3, 16
	addi	r4, r4, 16

L(align):
	/* The first 16 bytes have been checked at this point.  */
	vspltisb	v0, 0
	vspltisb	v2, -1
	lvsr	v6, 0, r4	/* Compute mask.  */
	or	r5, r4, r7
	andi.	r5, r5, 0xF
	beq	cr0, L(aligned)
	andi.	r5, r7, 0xF
	beq	cr0, L(s1_align)
	lvsr	v10, 0, r7	/* Compute mask.  */

	/* Both s1 and s2 are unaligned.  */
	GET16BYTES(v4, r7, v10)
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	beq	cr6, L(match)
	b	L(different)

	/* Align s1 to qw and adjust s2 address.  */
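	/* The adjustment below corresponds roughly to the following C
	   sketch, where s1 and s2 stand for the current read positions
	   held in r7 and r4 (the names are illustrative only, not taken
	   from the original sources):

	     size_t rem = (uintptr_t) s1 & 0xF;
	     s1 += 16 - rem;
	     s2 += 16 - rem;

	   This rounds s1 up to the next 16-byte boundary and moves s2 by
	   the same amount; any bytes between the new s1 and the end of the
	   block just compared have already matched, so re-reading them is
	   harmless.  */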
	.align 4
L(match):
	clrldi	r6, r7, 60
	subfic	r5, r6, 16
	add	r7, r7, r5
	add	r4, r4, r5
	andi.	r5, r4, 0xF
	beq	cr0, L(aligned)
	lvsr	v6, 0, r4
	/* There are two loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2 and compares them.
	   Loop until a mismatch or null occurs.  */
L(s1_align):
	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, r7, r0
	GET16BYTES(v5, r4, v6)
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	beq	cr6, L(s1_align)
	b	L(different)

	.align 4
L(aligned):
	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	bne	cr6, L(different)

	lvx	v4, 0, r7
	lvx	v5, 0, r4
	VCMPNEZB(v7, v5, v4)
	addi	r7, r7, 16
	addi	r4, r4, 16
	beq	cr6, L(aligned)

	/* Calculate and return the difference.  */
L(different):
	VCTZLSBB(r6, v7)
	VEXTUBRX(r5, r6, v4)
	VEXTUBRX(r4, r6, v5)
	subf	r3, r4, r5
	extsw	r3, r3
	blr

	.align 4
L(different_nocmpb):
	neg	r3, r9
	and	r9, r9, r3
	cntlzd	r9, r9
	subfic	r9, r9, 63
	srd	r3, r8, r9
	srd	r10, r10, r9
	rldicl	r10, r10, 0, 56
	rldicl	r3, r3, 0, 56
	subf	r3, r10, r3
	extsw	r3, r3
	blr

	.align 4
L(pagecross_check):
	subfic	r9, r9, 4096
	subfic	r7, r7, 4096
	cmpld	cr7, r7, r9
	bge	cr7, L(pagecross)
	mr	r7, r9

	/* If an unaligned 16-byte read would cross a 4K page boundary, use
	   a simple byte-by-byte comparison until the page boundary for s1
	   is reached.  */
L(pagecross):
	add	r7, r3, r7
	subf	r9, r3, r7
	mtctr	r9

	.align 4
L(pagecross_loop):
	/* Load a byte from s1 and s2, check whether *s1 is equal to *s2
	   and whether *s1 is '\0'.  */
	lbz	r9, 0(r3)
	lbz	r10, 0(r4)
	addi	r3, r3, 1
	addi	r4, r4, 1
	cmplw	cr7, r9, r10
	cmpdi	cr5, r9, r0
	bne	cr7, L(pagecross_ne)
	beq	cr5, L(pagecross_nullfound)
	bdnz	L(pagecross_loop)
	b	L(align)

	.align 4
L(pagecross_ne):
	extsw	r3, r9
	mr	r9, r10
L(pagecross_retdiff):
	subf	r9, r9, r3
	extsw	r3, r9
	blr

	.align 4
L(pagecross_nullfound):
	li	r3, 0
	b	L(pagecross_retdiff)
END (STRCMP)
libc_hidden_builtin_def (strcmp)