/* Optimized memrchr implementation for PowerPC64/POWER8.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memrchr (const void *s [r3], int byte [r4], size_t size [r5])  */

#ifndef MEMRCHR
# define MEMRCHR __memrchr
#endif
	.machine  power8
ENTRY_TOCLESS (MEMRCHR)
	CALL_MCOUNT 3
	add	r7, r3, r5	/* Calculate the last acceptable address.  */
	neg	r0, r7
	addi	r7, r7, -1
	mr	r10, r3
	clrrdi	r6, r7, 7
	li	r9, 3<<5
	dcbt	r9, r6, 8	/* Stream hint, decreasing addresses.  */

	/* Replicate BYTE to doubleword.  */
	insrdi	r4, r4, 8, 48
	insrdi	r4, r4, 16, 32
	insrdi	r4, r4, 32, 0
	li	r6, -8
	li	r9, -1
	rlwinm	r0, r0, 3, 26, 28	/* Calculate padding.  */
	clrrdi	r8, r7, 3
	srd	r9, r9, r0
	cmpldi	r5, 32
	clrrdi	r0, r10, 3
	ble	L(small_range)

#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8	/* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0	/* If r3 == 0, no BYTEs have been found.  */
	bne	cr7, L(done)

	/* Are we now aligned to a quadword boundary?  If so, skip to
	   the main loop.  Otherwise, go through the alignment code.  */
	andi.	r12, r8, 15
	beq	cr0, L(align_qw)

	/* Handle DWORD2 of pair.  */
#ifdef __LITTLE_ENDIAN__
	ldx	r12, r8, r6
#else
	ldbrx	r12, r8, r6
#endif
	addi	r8, r8, -8
	cmpb	r3, r12, r4
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)

	.align	4
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	sub	r5, r8, r0
	vspltisb	v0, 0
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	li	r0, 0
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
	mtvrd	v1, r4
	vspltb	v1, v1, 7
	cmpldi	r5, 64
	ble	L(tail64)
	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary necessarily slows down
	   performance for strings around 64 bytes in length due to the
	   extra comparisons required to check alignment for the vectorized
	   loop.  This is a tradeoff we are willing to take in order to
	   speed up the calculation for larger strings.  */
	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	/* In order to begin the 64B loop, the address needs to be 64-byte
	   aligned.  So read 16B blocks until it is.  */
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16
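	/* Each of the three blocks above peels one 16-byte vector off the
	   tail and checks it for BYTE, stopping as soon as r8 reaches a
	   64-byte boundary.  Since r8 is already 16-byte aligned here, at
	   most three peels are ever needed, which is why the check is
	   unrolled rather than looped.  */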
	/* At this point it should be 64B aligned.
	   Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5, 64		/* Check if r5 <= 64.  */
	ble	L(tail64)
	srdi	r9, r5, 6	/* Number of loop iterations.  */
	mtctr	r9		/* Setup the counter.  */
	li	r11, 16		/* Load required offsets.  */
	li	r9, 32
	li	r7, 48

	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align	4
L(loop):
	addi	r8, r8, -64	/* Adjust address for the next iteration.  */
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
	lvx	v3, r8, r11
	lvx	v4, r8, r9
	lvx	v5, r8, r7
	vcmpequb	v6, v1, v2
	vcmpequb	v7, v1, v3
	vcmpequb	v8, v1, v4
	vcmpequb	v9, v1, v5
	vor	v11, v6, v7
	vor	v12, v8, v9
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb.	v11, v0, v11
	bnl	cr6, L(found)
	bdnz	L(loop)
	clrldi	r5, r5, 58

	/* Handle the remainder of the 64B loop, or sizes <= 64B.  */
	.align	4
L(tail64):
	cmpldi	r5, 0
	beq	L(null)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	li	r3, 0
	blr

	/* Found a match in 64B loop.  */
	.align	4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	vbpermq	v7, v7, v10
	vbpermq	v8, v8, v10
	vbpermq	v9, v9, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v7, v7, v7, 2
	vsldoi	v8, v8, v8, 4
	vsldoi	v9, v9, v9, 6
#else
	vsldoi	v6, v6, v6, 6
	vsldoi	v7, v7, v7, 4
	vsldoi	v8, v8, v8, 2
#endif
	/* Merge the results and move to a GPR.  */
	vor	v11, v6, v7
	vor	v4, v9, v8
	vor	v4, v11, v4
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	cntlzd	r6, r5	/* Count leading zeros before the match.  */
#else
	addi	r6, r5, -1
	andc	r6, r6, r5
	popcntd	r6, r6
#endif
	addi	r8, r8, 63
	sub	r3, r8, r6	/* Compute final address.  */
	cmpld	cr7, r3, r10
	bgelr	cr7
	li	r3, 0
	blr

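	/* In L(found) above, vbpermq reduces each 16-byte compare result to
	   a 16-bit mask (one bit per byte), and the vsldoi/vor/mfvrd
	   sequence packs the four masks into one 64-bit GPR.  cntlzd
	   (little-endian) or the (x - 1) & ~x / popcntd trailing-zero count
	   (big-endian) then gives the distance of the last matching byte
	   from the end of the 64-byte block, which is subtracted from
	   r8 + 63 to form the final pointer.  */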
	/* Found a match in last 16 bytes.  */
	.align	4
L(found_16B):
	cmpld	r8, r10		/* Are we on the last QW?  */
	bge	L(last)
	/* Now discard bytes before starting address.  */
	sub	r9, r10, r8
	mtvrd	v9, r9
	vspltisb	v8, 3
	/* Mask unwanted bytes.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v7, 0, r10
	vperm	v6, v0, v6, v7
	vsldoi	v9, v0, v9, 8
	vsl	v9, v9, v8
	vslo	v6, v6, v9
#else
	lvsl	v7, 0, r10
	vperm	v6, v6, v0, v7
	vsldoi	v9, v0, v9, 8
	vsl	v9, v9, v8
	vsro	v6, v6, v9
#endif
L(last):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v6, v6, v6, 6
	mfvrd	r7, v6
	cntlzd	r6, r7	/* Count leading zeros before the match.  */
#else
	mfvrd	r7, v6
	addi	r6, r7, -1
	andc	r6, r6, r7
	popcntd	r6, r6
#endif
	addi	r8, r8, 15
	sub	r3, r8, r6	/* Compute final address.  */
	cmpld	r6, r5
	bltlr
	li	r3, 0
	blr

	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as BYTE in the original
	   word from the string.  Use that to calculate the pointer.
	   We need to make sure BYTE is *before* the end of the
	   range.  */
L(done):
	cntlzd	r9, r3	/* Count leading zeros before the match.  */
	cmpld	r8, r0	/* Are we on the last word?  */
	srdi	r6, r9, 3	/* Convert leading zeros to bytes.  */
	addi	r0, r6, -7
	sub	r3, r8, r0
	cmpld	cr7, r3, r10
	bnelr
	bgelr	cr7
	li	r3, 0
	blr

	.align	4
L(null):
	li	r3, 0
	blr

/* Deals with size <= 32.  */
	.align	4
L(small_range):
	cmpldi	r5, 0
	beq	L(null)

#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8	/* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)

	/* Are we done already?  */
	cmpld	r8, r0
	addi	r8, r8, -8
	beqlr

	.align	5
L(loop_small):
#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8
#endif
	cmpb	r3, r12, r4
	cmpld	r8, r0
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)
	addi	r8, r8, -8
	bne	L(loop_small)
	blr

END (MEMRCHR)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)
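
/* For reference, the routine above matches the semantics of this simple
   byte-by-byte C sketch (illustrative only; the assembly instead uses cmpb
   and VMX compares to examine 8 to 64 bytes per iteration):

   void *
   memrchr (const void *s, int byte, size_t size)
   {
     const unsigned char *p = (const unsigned char *) s + size;
     while (size-- > 0)
       if (*--p == (unsigned char) byte)
	 return (void *) p;
     return NULL;
   }
*/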