/* Optimized strspn implementation for Power8.

   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* size_t [r3] strspn (const char *string [r3],
		       const char *needleAccept [r4])  */

/* This takes a novel approach by computing a 256 bit mask whereby
   each set bit implies the byte is "accepted".  P8 vector hardware
   has extremely efficient hardware for selecting bits from a mask.

   One might ask "why not use bpermd for short strings"?  It is
   so slow that its performance about matches the generic PPC64
   variant without any fancy masking, with the added expense of
   making the mask.  That was the first variant of this.  */


#include "sysdep.h"

#ifndef USE_AS_STRCSPN
#  define USE_AS_STRCSPN 0
#  ifndef STRSPN
#    define STRSPN strspn
#  endif
#  define INITIAL_MASK 0
#  define UPDATE_MASK(RA, RS, RB) or	RA, RS, RB
#else
#  ifndef STRSPN
#    define STRSPN strcspn
#  endif
#  define INITIAL_MASK -1
#  define UPDATE_MASK(RA, RS, RB) andc	RA, RS, RB
#endif

/* Simple macro to use VSX instructions in overlapping VR's.  */
#define XXVR(insn, vrt, vra, vrb) \
	insn 32+vrt, 32+vra, 32+vrb

	.machine power8
ENTRY_TOCLESS (STRSPN, 4)
	CALL_MCOUNT 2

	/* Generate useful constants for later on.  */
	vspltisb v1, 7
	vspltisb v2, -1
	vslb	v1, v1, v1	/* 0x80 to swap high bit for vbpermq.  */
	vspltisb v10, 0
	vsldoi	v4, v10, v2, 2	/* 0xFFFF into vr4.  */
	XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches.  */

	/* Prepare to compute 256b mask.  */
	addi	r4, r4, -1
	li	r5, INITIAL_MASK
	li	r6, INITIAL_MASK
	li	r7, INITIAL_MASK
	li	r8, INITIAL_MASK

#if USE_AS_STRCSPN
	/* Ensure the null character never matches by clearing ISA bit 0
	   in r5, which is the bit which will check for it in the later
	   usage of vbpermq.  */
	srdi	r5, r5, 1
#endif

	li	r11, 1
	sldi	r11, r11, 63

	/* Start interleaved mask computation.
	   This will eventually or 1's into ignored bits from vbpermq.  */
	lvsr	v11, 0, r3
	vspltb	v11, v11, 0	/* Splat shift constant.  */

	/* Build a 256b mask in r5-r8.  */
	.align 4
L(next_needle):
	lbzu	r9, 1(r4)

	cmpldi	cr0, r9, 0
	cmpldi	cr1, r9, 128

	/* This is a little tricky.  srd only uses the first 7 bits,
	   and if bit 7 is set, value is always 0.  So, we can
	   effectively shift 128b in this case.  */
	xori	r12, r9, 0x40	/* Invert bit 6.  */
	srd	r10, r11, r9	/* Mask for bits 0-63.  */
	srd	r12, r11, r12	/* Mask for bits 64-127.  */

	beq	cr0, L(start_cmp)

	/* Now, or the value into the correct GPR.  */
	bge	cr1, L(needle_gt128)
	UPDATE_MASK (r5, r5, r10)	/* 0 - 63.  */
	UPDATE_MASK (r6, r6, r12)	/* 64 - 127.  */
	b	L(next_needle)

	.align 4
L(needle_gt128):
	UPDATE_MASK (r7, r7, r10)	/* 128 - 191.  */
	UPDATE_MASK (r8, r8, r12)	/* 192 - 255.  */
	b	L(next_needle)


	.align 4
L(start_cmp):
	/* Move and merge bitmap into 2 VRs.  bpermd is slower on P8.  */
	mr	r0, r3	/* Save r3 for final length computation.  */
	mtvrd	v5, r5
	mtvrd	v6, r6
	mtvrd	v7, r7
	mtvrd	v8, r8

	/* Continue interleaved mask generation.  */
#ifdef __LITTLE_ENDIAN__
	vsrw	v11, v2, v11	/* Note, shift ignores higher order bits.  */
	vsplth	v11, v11, 0	/* Only care about the high 16 bits of v10.  */
#else
	vslw	v11, v2, v11	/* Note, shift ignores higher order bits.  */
	vsplth	v11, v11, 1	/* Only care about the low 16 bits of v10.  */
#endif
	lvx	v0, 0, r3	/* Note, unaligned load ignores lower bits.  */

	/* Do the merging of the bitmask.  */
	XXVR(xxmrghd, v5, v5, v6)
	XXVR(xxmrghd, v6, v7, v8)

	/* Finish mask generation.  */
	vand	v11, v11, v4	/* Throw away bits not in the mask.  */

	/* Compare the first 1-16B, while masking unwanted bytes.  */
	clrrdi	r3, r3, 4	/* Note, counts from qw boundaries.  */
	vxor	v9, v0, v1	/* Swap high bit.  */
	vbpermq	v8, v5, v0
	vbpermq	v7, v6, v9
	vor	v7, v7, v8
	vor	v7, v7, v11	/* Ignore non-participating bytes.  */
	vcmpequh. v8, v7, v4
	bnl	cr6, L(done)

	addi	r3, r3, 16

	.align 4
L(vec):
	lvx	v0, 0, r3
	addi	r3, r3, 16
	vxor	v9, v0, v1	/* Swap high bit.  */
	vbpermq	v8, v5, v0
	vbpermq	v7, v6, v9
	vor	v7, v7, v8
	vcmpequh. v8, v7, v4
	blt	cr6, L(vec)

	addi	r3, r3, -16
L(done):
	subf	r3, r0, r3
	mfvrd	r10, v7

#ifdef __LITTLE_ENDIAN__
	addi	r0, r10, 1	/* Count the trailing 1's.  */
	andc	r10, r10, r0
	popcntd	r10, r10
#else
	xori	r10, r10, 0xffff	/* Count leading 1's by inverting.  */
	addi	r3, r3, -48	/* Account for the extra leading zeros.  */
	cntlzd	r10, r10
#endif

	add	r3, r3, r10
	blr

END(STRSPN)
libc_hidden_builtin_def (STRSPN)
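
/* An illustrative, scalar C sketch of the 256-bit "accepted byte" bitmap
   technique described in the header comment, assuming the strspn
   configuration (INITIAL_MASK 0, UPDATE_MASK == or).  The helper name
   strspn_bitmap_ref is purely for illustration and the block is guarded
   by "#if 0", so it never reaches the assembler.  The vector code above
   additionally covers strcspn via the macros at the top of the file and
   numbers bits to match vbpermq rather than the simple value % 64
   numbering used here.  */
#if 0
#include <stddef.h>
#include <stdint.h>

static size_t
strspn_bitmap_ref (const char *s, const char *accept)
{
  /* 256-bit bitmap, one uint64_t per 64 byte values (cf. r5-r8 above).  */
  uint64_t map[4] = { 0, 0, 0, 0 };
  const unsigned char *a = (const unsigned char *) accept;
  const unsigned char *p = (const unsigned char *) s;
  size_t n = 0;

  /* Set one bit per accepted byte value.  The terminating NUL of ACCEPT
     is never set, so the scan below always stops at the end of S.  */
  for (; *a != '\0'; a++)
    map[*a / 64] |= (uint64_t) 1 << (*a % 64);

  /* Count the leading bytes of S whose bit is set in the bitmap; the
     vector code does the same test 16 bytes at a time with vbpermq.  */
  while (map[p[n] / 64] & ((uint64_t) 1 << (p[n] % 64)))
    n++;

  return n;
}
#endif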