/* Optimized strlen implementation for PowerPC64/POWER9.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef STRLEN
# define STRLEN __strlen
# define DEFINE_STRLEN_HIDDEN_DEF 1
#endif

/* Implements the function

   int [r3] strlen (const void *s [r3])

   The implementation can load bytes past a matching byte, but only
   up to the next 64B boundary, so it never crosses a page.  */
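/* For reference, a rough C sketch of the scan structure implemented
   below.  It is illustrative only: it checks byte by byte where the
   assembly uses 16B vector compares, it never reads past the
   terminating null (the vector code may, up to the next 64B
   boundary), and strlen_sketch is a hypothetical name, not part of
   glibc.

   #include <stddef.h>
   #include <stdint.h>

   static size_t
   strlen_sketch (const char *s)
   {
     const char *p = s;

     // Head: bytes up to the next 16B boundary, one at a time.  The
     // assembly covers the same range with a single lvx/vperm plus
     // vcmpequb., filling the lanes before s with a non-matching byte.
     while (((uintptr_t) p & 15) != 0)
       {
         if (*p == '\0')
           return (size_t) (p - s);
         p++;
       }

     // Check 16B at a time until p is 64B aligned.  The assembly
     // tests one full 64B block and then up to three 48B rounds,
     // which reaches 64B alignment since 48 == -16 (mod 64).
     do
       {
         for (int i = 0; i < 16; i++)
           if (p[i] == '\0')
             return (size_t) (p + i - s);
         p += 16;
       }
     while (((uintptr_t) p & 63) != 0);

     // Main loop: 64B blocks.  The assembly loads four quadwords,
     // merges them with vminub, tests the merged vector once, and
     // only after a hit pinpoints the byte with vbpermq/cnttzd.
     for (;;)
       {
         for (int i = 0; i < 64; i++)
           if (p[i] == '\0')
             return (size_t) (p + i - s);
         p += 64;
       }
   }  */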
.machine power9
ENTRY_TOCLESS (STRLEN, 4)
        CALL_MCOUNT 2

        vspltisb  v18,0
        vspltisb  v19,-1

        neg       r5,r3
        rldicl    r9,r5,0,60    /* How many bytes to get source 16B aligned?  */

        /* Align data and fill bytes not loaded with a non-matching char.  */
        lvx       v0,0,r3
        lvsr      v1,0,r3
        vperm     v0,v19,v0,v1

        vcmpequb. v6,v0,v18
        beq       cr6,L(aligned)

        vctzlsbb  r3,v6
        blr

        /* Test 64B, 16B at a time.  The 64B vector loop is optimized for
           longer strings.  Likewise, we check a multiple of 64B to avoid
           breaking the alignment calculation below.  */
L(aligned):
        add       r4,r3,r9
        rldicl.   r5,r4,60,62   /* Determine the number of 48B loops needed
                                   for alignment to 64B, and test for zero.  */

        lxv       v0+32,0(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail1)

        lxv       v0+32,16(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail2)

        lxv       v0+32,32(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail3)

        lxv       v0+32,48(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail4)
        addi      r4,r4,64

        /* Speculatively prepare a fake 16B aligned address so lvsl can
           generate the vector byte constant 0,1,..,15 during reduction.  */
        li        r0,0

        /* Skip the alignment if already 64B aligned.  */
        beq       L(loop_64b)
        mtctr     r5

        /* Test 48B per iteration until 64B aligned.  */
        .p2align 5
L(loop):
        lxv       v0+32,0(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail1)

        lxv       v0+32,16(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail2)

        lxv       v0+32,32(r4)
        vcmpequb. v6,v0,v18
        bne       cr6,L(tail3)

        addi      r4,r4,48
        bdnz      L(loop)

        .p2align 5
L(loop_64b):
        lxv       v1+32,0(r4)   /* Load 4 quadwords.  */
        lxv       v2+32,16(r4)
        lxv       v3+32,32(r4)
        lxv       v4+32,48(r4)
        vminub    v5,v1,v2      /* Compare and merge into one VR for speed.  */
        vminub    v6,v3,v4
        vminub    v7,v5,v6
        vcmpequb. v7,v7,v18     /* Check for NULLs.  */
        addi      r4,r4,64      /* Adjust address for the next iteration.  */
        bne       cr6,L(vmx_zero)

        lxv       v1+32,0(r4)   /* Load 4 quadwords.  */
        lxv       v2+32,16(r4)
        lxv       v3+32,32(r4)
        lxv       v4+32,48(r4)
        vminub    v5,v1,v2      /* Compare and merge into one VR for speed.  */
        vminub    v6,v3,v4
        vminub    v7,v5,v6
        vcmpequb. v7,v7,v18     /* Check for NULLs.  */
        addi      r4,r4,64      /* Adjust address for the next iteration.  */
        bne       cr6,L(vmx_zero)

        lxv       v1+32,0(r4)   /* Load 4 quadwords.  */
        lxv       v2+32,16(r4)
        lxv       v3+32,32(r4)
        lxv       v4+32,48(r4)
        vminub    v5,v1,v2      /* Compare and merge into one VR for speed.  */
        vminub    v6,v3,v4
        vminub    v7,v5,v6
        vcmpequb. v7,v7,v18     /* Check for NULLs.  */
        addi      r4,r4,64      /* Adjust address for the next iteration.  */
        beq       cr6,L(loop_64b)

L(vmx_zero):
        /* OK, we found a null byte.  Let's look for it in the current 64-byte
           block and mark it in its corresponding VR.  */
        vcmpequb  v1,v1,v18
        vcmpequb  v2,v2,v18
        vcmpequb  v3,v3,v18
        vcmpequb  v4,v4,v18

        /* We will now 'compress' the result into a single doubleword, so it
           can be moved to a GPR for the final calculation.  First, we generate
           an appropriate mask for vbpermq, so we can permute the bits into the
           first halfword.  */
        vspltisb  v10,3
        lvsl      v11,0,r0
        vslb      v10,v11,v10

        /* Permute the first bit of each byte into bits 48-63.  */
        vbpermq   v1,v1,v10
        vbpermq   v2,v2,v10
        vbpermq   v3,v3,v10
        vbpermq   v4,v4,v10

        /* Shift each component into its correct position for merging.  */
        vsldoi    v2,v2,v2,2
        vsldoi    v3,v3,v3,4
        vsldoi    v4,v4,v4,6

        /* Merge the results and move to a GPR.  */
        vor       v1,v2,v1
        vor       v2,v3,v4
        vor       v4,v1,v2
        mfvrd     r10,v4

        /* Adjust address to the beginning of the current 64-byte block.  */
        addi      r4,r4,-64

        cnttzd    r0,r10        /* Count trailing zeros before the match.  */
        subf      r5,r3,r4
        add       r3,r5,r0      /* Compute final length.  */
        blr

L(tail1):
        vctzlsbb  r0,v6
        add       r4,r4,r0
        subf      r3,r3,r4
        blr

L(tail2):
        vctzlsbb  r0,v6
        add       r4,r4,r0
        addi      r4,r4,16
        subf      r3,r3,r4
        blr

L(tail3):
        vctzlsbb  r0,v6
        add       r4,r4,r0
        addi      r4,r4,32
        subf      r3,r3,r4
        blr

L(tail4):
        vctzlsbb  r0,v6
        add       r4,r4,r0
        addi      r4,r4,48
        subf      r3,r3,r4
        blr

END (STRLEN)

#ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
#endif