/* Optimized strlen implementation for POWER10 LE.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* To reuse the code for rawmemchr, we have some extra steps compared to the
   strlen implementation:
   - Sum the initial value of r3 with the position at which the char was
     found, to guarantee we return a pointer and not the length.
   - In the main loop, subtract each byte by the char we are looking for,
     so we can keep using vminub to quickly check 64B at once.

   Register roles (both variants unless noted):
     r3		input pointer S; holds the result on return.
     r4		rawmemchr: input char C until L(aligned); afterwards the
		first 64B-aligned main-loop pointer.  Scratch in TAIL.
     r5		rawmemchr entry: C ^ 0xff.  Then the 16B-aligned address used
		by the CHECK16 sequence, then the second main-loop pointer
		(r4 + 128).
     r6		offset passed to the last CHECK64 executed.
     r7-r10, r0	scratch in L(tail_64b).
     cr5	rawmemchr only: result of comparing C with 0.
     v18	byte to look for, replicated (all zeros for strlen).
     v19	a byte that can never match, replicated.
     VREG_ZERO	all zeros.
   Only volatile registers are used; nothing is saved or restored.  */
#ifdef USE_AS_RAWMEMCHR
# ifndef RAWMEMCHR
#  define FUNCNAME __rawmemchr
# else
#  define FUNCNAME RAWMEMCHR
# endif
# define MCOUNT_NARGS 2
/* v18 holds the char here, so a separate zero vector is required.  It has to
   be a volatile register because this function does not save any vector
   register: v20-v31 are non-volatile in the ELFv2 ABI, so use v17, which is
   not used anywhere else in this file.  */
# define VREG_ZERO v17
# define OFF_START_LOOP 256
/* Turn "byte == c" into "byte == 0" for the four vectors CHECK64 loaded.  */
# define RAWMEMCHR_SUBTRACT_VECTORS \
	vsububm	  v4,v4,v18; \
	vsububm	  v5,v5,v18; \
	vsububm	  v6,v6,v18; \
	vsububm	  v7,v7,v18;
/* Return from a CHECK16 hit.  VREG is the compare result, INCREMENT the
   offset CHECK16 used and r5 its base address.
   Result: r3 = r5 + INCREMENT + index of the first match.  */
# define TAIL(vreg,increment) \
	vctzlsbb  r4,vreg; \
	addi	  r4,r4,increment; \
	add	  r3,r5,r4; \
	blr

#else /* strlen */

# ifndef STRLEN
#  define FUNCNAME __strlen
#  define DEFINE_STRLEN_HIDDEN_DEF 1
# else
#  define FUNCNAME STRLEN
# endif
# define MCOUNT_NARGS 1
/* For strlen the byte to look for is zero, so v18 doubles as zero vector.  */
# define VREG_ZERO v18
# define OFF_START_LOOP 192
/* Return from a CHECK16 hit.  VREG is the compare result, INCREMENT the
   offset CHECK16 used and r5 its base address.
   Result: r3 = (r5 - r3) + INCREMENT + index of the first match.  */
# define TAIL(vreg,increment) \
	vctzlsbb  r4,vreg; \
	subf	  r3,r3,r5; \
	addi	  r4,r4,increment; \
	add	  r3,r3,r4; \
	blr
#endif /* USE_AS_RAWMEMCHR */

/* TODO: Replace macros by the actual instructions when minimum binutils becomes
   >= 2.35.  This is used to keep compatibility with older versions.  */
#define VEXTRACTBM(rt,vrb) \
	.long(((4)<<(32-6)) \
	      | ((rt)<<(32-11)) \
	      | ((8)<<(32-16)) \
	      | ((vrb)<<(32-21)) \
	      | 1602)

/* DQ is parenthesized because callers pass expressions such as offset+32.  */
#define LXVP(xtp,dq,ra) \
	.long(((6)<<(32-6)) \
	      | ((((xtp)-32)>>1)<<(32-10)) \
	      | ((1)<<(32-11)) \
	      | ((ra)<<(32-16)) \
	      | (dq))

/* Load 16B from OFFSET(ADDR) into VREG, compare every byte against v18 and
   branch to L(LABEL) if at least one byte matched.  VREG is left holding
   the compare result, which is what TAIL expects.  */
#define CHECK16(vreg,offset,addr,label) \
	lxv	  vreg+32,offset(addr); \
	vcmpequb. vreg,vreg,v18; \
	bne	  cr6,L(label);

/* Load 4 quadwords, merge into one VR for speed and check for NULLs.  r6 is
   set to OFFSET, the number of bytes past ADDR that were already checked,
   so the tail code can rebuild the address of the block.  Leaves the four
   (possibly subtracted) quadwords in v4-v7; clobbers v0 and v14-v16.  */
#define CHECK64(offset,addr,label) \
	li	  r6,offset; \
	LXVP(v4+32,offset,addr); \
	LXVP(v6+32,offset+32,addr); \
	RAWMEMCHR_SUBTRACT_VECTORS; \
	vminub	  v14,v4,v5; \
	vminub	  v15,v6,v7; \
	vminub	  v16,v14,v15; \
	vcmpequb. v0,v16,VREG_ZERO; \
	bne	  cr6,L(label)

/* Implements the function

   size_t [r3] strlen (const char *s [r3])

   but when USE_AS_RAWMEMCHR is set, implements the function

   void* [r3] rawmemchr (const void *s [r3], int c [r4])

   The implementation can load bytes past a matching byte, but only
   up to the next 64B boundary, so it never crosses a page.  */

/* The POWER10 instructions are emitted through the .long macros above, so
   the assembler only needs to accept POWER9 mnemonics.  */
.machine power9

ENTRY_TOCLESS (FUNCNAME, 4)
	CALL_MCOUNT MCOUNT_NARGS

#ifdef USE_AS_RAWMEMCHR
	/* Flipping the low 8 bits of c gives a byte that differs from c.  */
	xori	  r5,r4,0xff

	mtvsrd	  v18+32,r4	/* matching char in v18  */
	mtvsrd	  v19+32,r5	/* non matching char in v19  */

	vspltb	  v18,v18,7	/* replicate  */
	vspltb	  v19,v19,7	/* replicate  */
#else
	/* 0xff in every byte: never equal to the terminator.  */
	vspltisb  v19,-1
#endif
	vspltisb  VREG_ZERO,0

	/* Next 16B-aligned address.  Prepare address for L(aligned).  */
	addi	  r5,r3,16
	clrrdi	  r5,r5,4

	/* Align data and fill bytes not loaded with non matching char.  */
	lvx	  v0,0,r3
	lvsr	  v1,0,r3
	vperm	  v0,v19,v0,v1

	/* No match in the first (partial) quadword: go on from r5.  */
	vcmpequb. v6,v0,v18
	beq	  cr6,L(aligned)

	/* Match found: vctzlsbb yields its index counted from s.  */
#ifdef USE_AS_RAWMEMCHR
	vctzlsbb  r6,v6
	add	  r3,r3,r6
#else
	vctzlsbb  r3,v6
#endif
	blr

	/* Test up to OFF_START_LOOP-16 bytes in 16B chunks.  The main loop is
	   optimized for longer strings, so checking the first bytes in 16B
	   chunks benefits a lot small strings.  */
	.p2align 5
L(aligned):
#ifdef USE_AS_RAWMEMCHR
	/* Check if c == 0.  This will be useful to choose how we will
	   perform the main loop.  It has to be done here because r4 is
	   overwritten right below.  */
	cmpdi	  cr5,r4,0
#endif
	/* Prepare address for the loop: r4 is 64B-aligned and never beyond
	   the end of the area covered by the CHECK16 sequence.  */
	addi	  r4,r3,OFF_START_LOOP
	clrrdi	  r4,r4,6

	CHECK16(v0,0,r5,tail1)
	CHECK16(v1,16,r5,tail2)
	CHECK16(v2,32,r5,tail3)
	CHECK16(v3,48,r5,tail4)
	CHECK16(v4,64,r5,tail5)
	CHECK16(v5,80,r5,tail6)
	CHECK16(v6,96,r5,tail7)
	CHECK16(v7,112,r5,tail8)
	CHECK16(v8,128,r5,tail9)
	CHECK16(v9,144,r5,tail10)
	CHECK16(v10,160,r5,tail11)
#ifdef USE_AS_RAWMEMCHR
	CHECK16(v0,176,r5,tail12)
	CHECK16(v1,192,r5,tail13)
	CHECK16(v2,208,r5,tail14)
	CHECK16(v3,224,r5,tail15)
#endif

	/* Second loop pointer, 128B ahead of the first one.  */
	addi	  r5,r4,128

#ifdef USE_AS_RAWMEMCHR
	/* If c == 0, use the same loop as strlen, without the vsububm.  */
	beq	  cr5,L(loop)

	/* This is very similar to the block after L(loop), the difference is
	   that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
	   each byte loaded by the char we are looking for, this way we can keep
	   using vminub to merge the results and checking for nulls.  */
	.p2align 5
L(rawmemchr_loop):
	CHECK64(0,r4,pre_tail_64b)
	CHECK64(64,r4,pre_tail_64b)
	addi	  r4,r4,256

	CHECK64(0,r5,tail_64b)
	CHECK64(64,r5,tail_64b)
	addi	  r5,r5,256

	b	  L(rawmemchr_loop)
#endif
	/* Switch to a more aggressive approach checking 64B each time.  Use 2
	   pointers 128B apart and unroll the loop once to make the pointer
	   updates and usages separated enough to avoid stalls waiting for
	   address calculation.  */
	.p2align 5
L(loop):
	/* From here on CHECK64 expands without the subtraction step.  */
#undef RAWMEMCHR_SUBTRACT_VECTORS
#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
	CHECK64(0,r4,pre_tail_64b)
	CHECK64(64,r4,pre_tail_64b)
	addi	  r4,r4,256

	CHECK64(0,r5,tail_64b)
	CHECK64(64,r5,tail_64b)
	addi	  r5,r5,256

	b	  L(loop)

	/* Hit found through the r4 pointer: move it to r5 so that both cases
	   share the code below.  */
	.p2align 5
L(pre_tail_64b):
	mr	  r5,r4
L(tail_64b):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
	   low 16B bytes into vx+1, and the high into vx, so the order here is
	   v5, v4, v7, v6.  When coming from L(rawmemchr_loop), v4-v7 already
	   had the char subtracted, so a match is a zero byte there too.  */
	vcmpequb  v1,v5,VREG_ZERO
	vcmpequb  v2,v4,VREG_ZERO
	vcmpequb  v3,v7,VREG_ZERO
	vcmpequb  v4,v6,VREG_ZERO

	/* Take into account the other 64B blocks we had already checked.  */
	add	  r5,r5,r6

	/* Extract first bit of each byte.  */
	VEXTRACTBM(r7,v1)
	VEXTRACTBM(r8,v2)
	VEXTRACTBM(r9,v3)
	VEXTRACTBM(r10,v4)

	/* Shift each value into their corresponding position.  */
	sldi	  r8,r8,16
	sldi	  r9,r9,32
	sldi	  r10,r10,48

	/* Merge the results.  */
	or	  r7,r7,r8
	or	  r8,r9,r10
	or	  r10,r8,r7

	cnttzd	  r0,r10	/* Count trailing zeros before the match.  */
#ifndef USE_AS_RAWMEMCHR
	/* strlen returns a length: make r5 relative to the start of s.  */
	subf	  r5,r3,r5
#endif
	add	  r3,r5,r0	/* Compute final length (or pointer).  */
	blr

	/* One exit per CHECK16 above; the second TAIL argument must match the
	   offset used by the corresponding CHECK16.  */
	.p2align 5
L(tail1):
	TAIL(v0,0)

	.p2align 5
L(tail2):
	TAIL(v1,16)

	.p2align 5
L(tail3):
	TAIL(v2,32)

	.p2align 5
L(tail4):
	TAIL(v3,48)

	.p2align 5
L(tail5):
	TAIL(v4,64)

	.p2align 5
L(tail6):
	TAIL(v5,80)

	.p2align 5
L(tail7):
	TAIL(v6,96)

	.p2align 5
L(tail8):
	TAIL(v7,112)

	.p2align 5
L(tail9):
	TAIL(v8,128)

	.p2align 5
L(tail10):
	TAIL(v9,144)

	.p2align 5
L(tail11):
	TAIL(v10,160)

#ifdef USE_AS_RAWMEMCHR
	.p2align 5
L(tail12):
	TAIL(v0,176)

	.p2align 5
L(tail13):
	TAIL(v1,192)

	.p2align 5
L(tail14):
	TAIL(v2,208)

	.p2align 5
L(tail15):
	TAIL(v3,224)
#endif

END (FUNCNAME)

#ifdef USE_AS_RAWMEMCHR
weak_alias (__rawmemchr,rawmemchr)
libc_hidden_builtin_def (__rawmemchr)
#else
# ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
# endif
#endif