1/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn. 2 Copyright (C) 2017-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <sysdep.h> 20 21/* char *[r3] strrchr (char *s [r3], int c [r4]) */ 22 23#ifdef __LITTLE_ENDIAN__ 24/* Find the match position from v6 and place result in r6. */ 25# define CALCULATE_MATCH() \ 26 vbpermq v6, v6, v10; \ 27 vsldoi v6, v6, v6, 6; \ 28 mfvrd r7, v6; \ 29 cntlzd r6, r7; \ 30 subfic r6, r6, 15; 31/* 32 * Find the first null position to mask bytes after null. 33 * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw. 34 * Result placed at v2. 35 */ 36# define FIND_NULL_POS(reg) \ 37 vspltisb v11, -1; \ 38 vadduqm v11, reg, v11; \ 39 vandc v11, v11, reg; \ 40 vpopcntd v2, v11; \ 41 vspltb v11, v2, 15; \ 42 vcmpequb. v11, v11, v9; \ 43 blt cr6, 1f; \ 44 vsldoi v9, v0, v9, 1; \ 45 vslo v2, v2, v9; \ 461: \ 47 vsumsws v2, v2, v0; 48#else 49# define CALCULATE_MATCH() \ 50 vbpermq v6, v6, v10; \ 51 mfvrd r7, v6; \ 52 addi r6, r7, -1; \ 53 andc r6, r6, r7; \ 54 popcntd r6, r6; \ 55 subfic r6, r6, 15; 56# define FIND_NULL_POS(reg) \ 57 vclzd v2, reg; \ 58 vspltb v11, v2, 7; \ 59 vcmpequb. v11, v11, v9; \ 60 blt cr6, 1f; \ 61 vsldoi v9, v0, v9, 1; \ 62 vsro v2, v2, v9; \ 631: \ 64 vsumsws v2, v2, v0; 65#endif /* !__LITTLE_ENDIAN__ */ 66 67#ifndef STRRCHR 68# define STRRCHR strrchr 69#endif 70 .machine power8 71ENTRY_TOCLESS (STRRCHR) 72 CALL_MCOUNT 2 73 dcbt 0,r3 74 clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ 75 cmpdi cr7,r4,0 76 ld r12,0(r8) /* Load doubleword from memory. */ 77 li r9,0 /* Used to store last occurence. */ 78 li r0,0 /* Doubleword with null chars to use 79 with cmpb. */ 80 81 rlwinm r6,r3,3,26,28 /* Calculate padding. */ 82 83 beq cr7,L(null_match) 84 85 /* Replicate byte to doubleword. */ 86 insrdi r4,r4,8,48 87 insrdi r4,r4,16,32 88 insrdi r4,r4,32,0 89 90 /* r4 is changed now. If it's passed more chars, then 91 check for null again. */ 92 cmpdi cr7,r4,0 93 beq cr7,L(null_match) 94 /* Now r4 has a doubleword of c bytes and r0 has 95 a doubleword of null bytes. */ 96 97 cmpb r10,r12,r4 /* Compare each byte against c byte. */ 98 cmpb r11,r12,r0 /* Compare each byte against null byte. */ 99 100 /* Move the doublewords left and right to discard the bits that are 101 not part of the string and bring them back as zeros. */ 102#ifdef __LITTLE_ENDIAN__ 103 srd r10,r10,r6 104 srd r11,r11,r6 105 sld r10,r10,r6 106 sld r11,r11,r6 107#else 108 sld r10,r10,r6 109 sld r11,r11,r6 110 srd r10,r10,r6 111 srd r11,r11,r6 112#endif 113 or r5,r10,r11 /* OR the results to speed things up. */ 114 cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes 115 have been found. */ 116 bne cr7,L(done) 117 118L(align): 119 andi. r12, r8, 15 120 121 /* Are we now aligned to a doubleword boundary? If so, skip to 122 the main loop. Otherwise, go through the alignment code. */ 123 124 bne cr0, L(loop) 125 126 /* Handle WORD2 of pair. */ 127 ldu r12,8(r8) 128 cmpb r10,r12,r4 129 cmpb r11,r12,r0 130 or r5,r10,r11 131 cmpdi cr7,r5,0 132 bne cr7,L(done) 133 b L(loop) /* We branch here (rather than falling through) 134 to skip the nops due to heavy alignment 135 of the loop below. */ 136 .p2align 5 137L(loop): 138 /* Load two doublewords, compare and merge in a 139 single register for speed. This is an attempt 140 to speed up the null-checking process for bigger strings. */ 141 ld r12,8(r8) 142 ldu r7,16(r8) 143 cmpb r10,r12,r4 144 cmpb r11,r12,r0 145 cmpb r6,r7,r4 146 cmpb r7,r7,r0 147 or r12,r10,r11 148 or r5,r6,r7 149 or r5,r12,r5 150 cmpdi cr7,r5,0 151 beq cr7,L(vector) 152 153 /* OK, one (or both) of the doublewords contains a c/null byte. Check 154 the first doubleword and decrement the address in case the first 155 doubleword really contains a c/null byte. */ 156 cmpdi cr6,r12,0 157 addi r8,r8,-8 158 bne cr6,L(done) 159 160 /* The c/null byte must be in the second doubleword. Adjust the 161 address again and move the result of cmpb to r10 so we can calculate 162 the pointer. */ 163 164 mr r10,r6 165 mr r11,r7 166 addi r8,r8,8 167 168 /* r10/r11 have the output of the cmpb instructions, that is, 169 0xff in the same position as the c/null byte in the original 170 doubleword from the string. Use that to calculate the pointer. */ 171 172L(done): 173 /* If there are more than one 0xff in r11, find the first position of 174 0xff in r11 and fill r10 with 0 from that position. */ 175 cmpdi cr7,r11,0 176 beq cr7,L(no_null) 177#ifdef __LITTLE_ENDIAN__ 178 addi r3,r11,-1 179 andc r3,r3,r11 180 popcntd r0,r3 181#else 182 cntlzd r0,r11 183#endif 184 subfic r0,r0,63 185 li r6,-1 186#ifdef __LITTLE_ENDIAN__ 187 srd r0,r6,r0 188#else 189 sld r0,r6,r0 190#endif 191 and r10,r0,r10 192L(no_null): 193#ifdef __LITTLE_ENDIAN__ 194 cntlzd r0,r10 /* Count leading zeros before c matches. */ 195 addi r3,r10,-1 196 andc r3,r3,r10 197 addi r10,r11,-1 198 andc r10,r10,r11 199 cmpld cr7,r3,r10 200 bgt cr7,L(no_match) 201#else 202 addi r3,r10,-1 /* Count trailing zeros before c matches. */ 203 andc r3,r3,r10 204 popcntd r0,r3 205 cmpld cr7,r11,r10 206 bgt cr7,L(no_match) 207#endif 208 srdi r0,r0,3 /* Convert trailing zeros to bytes. */ 209 subfic r0,r0,7 210 add r9,r8,r0 /* Return address of the matching c byte 211 or null in case c was not found. */ 212 li r0,0 213 cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */ 214 beq cr7,L(align) 215 216 .align 4 217L(no_match): 218 mr r3,r9 219 blr 220 221/* Check the first 32B in GPR's and move to vectorized loop. */ 222 .p2align 5 223L(vector): 224 addi r3, r8, 8 225 /* Make sure 32B aligned. */ 226 andi. r10, r3, 31 227 bne cr0, L(loop) 228 vspltisb v0, 0 229 /* Precompute vbpermq constant. */ 230 vspltisb v10, 3 231 lvsl v11, r0, r0 232 vslb v10, v11, v10 233 mtvrd v1, r4 234 li r5, 16 235 vspltb v1, v1, 7 236 /* Compare 32 bytes in each loop. */ 237L(continue): 238 lvx v4, 0, r3 239 lvx v5, r3, r5 240 vcmpequb v2, v0, v4 241 vcmpequb v3, v0, v5 242 vcmpequb v6, v1, v4 243 vcmpequb v7, v1, v5 244 vor v8, v2, v3 245 vor v9, v6, v7 246 vor v11, v8, v9 247 vcmpequb. v11, v0, v11 248 addi r3, r3, 32 249 blt cr6, L(continue) 250 vcmpequb. v8, v0, v8 251 blt cr6, L(match) 252 253 /* One (or both) of the quadwords contains c/null. */ 254 vspltisb v8, 2 255 vspltisb v9, 5 256 /* Precompute values used for comparison. */ 257 vsl v9, v8, v9 /* v9 = 0x4040404040404040. */ 258 vaddubm v8, v9, v9 259 vsldoi v8, v0, v8, 1 /* v8 = 0x80. */ 260 261 /* Check if null is in second qw. */ 262 vcmpequb. v11, v0, v2 263 blt cr6, L(secondqw) 264 265 /* Null found in first qw. */ 266 addi r8, r3, -32 267 /* Calculate the null position. */ 268 FIND_NULL_POS(v2) 269 /* Check if null is in the first byte. */ 270 vcmpequb. v11, v0, v2 271 blt cr6, L(no_match) 272 vsububm v2, v8, v2 273 /* Mask unwanted bytes after null. */ 274#ifdef __LITTLE_ENDIAN__ 275 vslo v6, v6, v2 276 vsro v6, v6, v2 277#else 278 vsro v6, v6, v2 279 vslo v6, v6, v2 280#endif 281 vcmpequb. v11, v0, v6 282 blt cr6, L(no_match) 283 /* Found a match before null. */ 284 CALCULATE_MATCH() 285 add r3, r8, r6 286 blr 287 288L(secondqw): 289 addi r8, r3, -16 290 FIND_NULL_POS(v3) 291 vcmpequb. v11, v0, v2 292 blt cr6, L(no_match1) 293 vsububm v2, v8, v2 294 /* Mask unwanted bytes after null. */ 295#ifdef __LITTLE_ENDIAN__ 296 vslo v7, v7, v2 297 vsro v7, v7, v2 298#else 299 vsro v7, v7, v2 300 vslo v7, v7, v2 301#endif 302 vcmpequb. v11, v0, v7 303 blt cr6, L(no_match1) 304 addi r8, r8, 16 305 vor v6, v0, v7 306L(no_match1): 307 addi r8, r8, -16 308 vcmpequb. v11, v0, v6 309 blt cr6, L(no_match) 310 /* Found a match before null. */ 311 CALCULATE_MATCH() 312 add r3, r8, r6 313 blr 314 315L(match): 316 /* One (or both) of the quadwords contains a match. */ 317 mr r8, r3 318 vcmpequb. v8, v0, v7 319 blt cr6, L(firstqw) 320 /* Match found in second qw. */ 321 addi r8, r8, 16 322 vor v6, v0, v7 323L(firstqw): 324 addi r8, r8, -32 325 CALCULATE_MATCH() 326 add r9, r8, r6 /* Compute final length. */ 327 b L(continue) 328/* We are here because strrchr was called with a null byte. */ 329 .align 4 330L(null_match): 331 /* r0 has a doubleword of null bytes. */ 332 333 cmpb r5,r12,r0 /* Compare each byte against null bytes. */ 334 335 /* Move the doublewords left and right to discard the bits that are 336 not part of the string and bring them back as zeros. */ 337#ifdef __LITTLE_ENDIAN__ 338 srd r5,r5,r6 339 sld r5,r5,r6 340#else 341 sld r5,r5,r6 342 srd r5,r5,r6 343#endif 344 cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes 345 have been found. */ 346 bne cr7,L(done_null) 347 348 andi. r12, r8, 15 349 350 /* Are we now aligned to a quadword boundary? If so, skip to 351 the main loop. Otherwise, go through the alignment code. */ 352 353 bne cr0, L(loop_null) 354 355 /* Handle WORD2 of pair. */ 356 ldu r12,8(r8) 357 cmpb r5,r12,r0 358 cmpdi cr7,r5,0 359 bne cr7,L(done_null) 360 b L(loop_null) /* We branch here (rather than falling through) 361 to skip the nops due to heavy alignment 362 of the loop below. */ 363 364 /* Main loop to look for the end of the string. Since it's a 365 small loop (< 8 instructions), align it to 32-bytes. */ 366 .p2align 5 367L(loop_null): 368 /* Load two doublewords, compare and merge in a 369 single register for speed. This is an attempt 370 to speed up the null-checking process for bigger strings. */ 371 ld r12,8(r8) 372 ldu r11,16(r8) 373 cmpb r5,r12,r0 374 cmpb r10,r11,r0 375 or r6,r5,r10 376 cmpdi cr7,r6,0 377 beq cr7,L(vector1) 378 379 /* OK, one (or both) of the doublewords contains a null byte. Check 380 the first doubleword and decrement the address in case the first 381 doubleword really contains a null byte. */ 382 383 cmpdi cr6,r5,0 384 addi r8,r8,-8 385 bne cr6,L(done_null) 386 387 /* The null byte must be in the second doubleword. Adjust the address 388 again and move the result of cmpb to r10 so we can calculate the 389 pointer. */ 390 391 mr r5,r10 392 addi r8,r8,8 393 394 /* r5 has the output of the cmpb instruction, that is, it contains 395 0xff in the same position as the null byte in the original 396 doubleword from the string. Use that to calculate the pointer. */ 397L(done_null): 398#ifdef __LITTLE_ENDIAN__ 399 addi r0,r5,-1 400 andc r0,r0,r5 401 popcntd r0,r0 402#else 403 cntlzd r0,r5 /* Count leading zeros before the match. */ 404#endif 405 srdi r0,r0,3 /* Convert trailing zeros to bytes. */ 406 add r3,r8,r0 /* Return address of the matching null byte. */ 407 blr 408/* Check the first 32B in GPR's and move to vectorized loop. */ 409 .p2align 5 410L(vector1): 411 addi r3, r8, 8 412 /* Make sure 32B aligned. */ 413 andi. r10, r3, 31 414 bne cr0, L(loop_null) 415 vspltisb v0, 0 416 /* Precompute vbpermq constant. */ 417 vspltisb v10, 3 418 lvsl v11, r0, r0 419 vslb v10, v11, v10 420 li r5, 16 421 /* Compare 32 bytes in each loop. */ 422L(continue1): 423 lvx v4, 0, r3 424 lvx v5, r3, r5 425 vcmpequb v2, v0, v4 426 vcmpequb v3, v0, v5 427 vor v8, v2, v3 428 vcmpequb. v11, v0, v8 429 addi r3, r3, 32 430 blt cr6, L(continue1) 431 addi r3, r3, -32 432 vbpermq v2, v2, v10 433 vbpermq v3, v3, v10 434 /* Shift each component into its correct position for merging. */ 435#ifdef __LITTLE_ENDIAN__ 436 vsldoi v3, v3, v3, 2 437#else 438 vsldoi v2, v2, v2, 6 439 vsldoi v3, v3, v3, 4 440#endif 441 /* Merge the results and move to a GPR. */ 442 vor v4, v3, v2 443 mfvrd r5, v4 444#ifdef __LITTLE_ENDIAN__ 445 addi r6, r5, -1 446 andc r6, r6, r5 447 popcntd r6, r6 448#else 449 cntlzd r6, r5 /* Count leading zeros before the match. */ 450#endif 451 add r3, r3, r6 /* Compute final length. */ 452 blr 453END_GEN_TB (STRRCHR, TB_TOCLESS) 454weak_alias (strrchr, rindex) 455libc_hidden_builtin_def (strrchr) 456