1/* Optimized strlen implementation for PowerPC64. 2 Copyright (C) 1997-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <sysdep.h> 20 21/* The algorithm here uses the following techniques: 22 23 1) Given a word 'x', we can test to see if it contains any 0 bytes 24 by subtracting 0x01010101, and seeing if any of the high bits of each 25 byte changed from 0 to 1. This works because the least significant 26 0 byte must have had no incoming carry (otherwise it's not the least 27 significant), so it is 0x00 - 0x01 == 0xff. For all other 28 byte values, either they have the high bit set initially, or when 29 1 is subtracted you get a value in the range 0x00-0x7f, none of which 30 have their high bit set. The expression here is 31 (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when 32 there were no 0x00 bytes in the word. You get 0x80 in bytes that 33 match, but possibly false 0x80 matches in the next more significant 34 byte to a true match due to carries. For little-endian this is 35 of no consequence since the least significant match is the one 36 we're interested in, but big-endian needs method 2 to find which 37 byte matches. 38 39 2) Given a word 'x', we can test to see _which_ byte was zero by 40 calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f). 41 This produces 0x80 in each byte that was zero, and 0x00 in all 42 the other bytes. The '| 0x7f7f7f7f' clears the low 7 bits in each 43 byte, and the '| x' part ensures that bytes with the high bit set 44 produce 0x00. The addition will carry into the high bit of each byte 45 iff that byte had one of its low 7 bits set. We can then just see 46 which was the most significant bit set and divide by 8 to find how 47 many to add to the index. 48 This is from the book 'The PowerPC Compiler Writer's Guide', 49 by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren. 50 51 We deal with strings not aligned to a word boundary by taking the 52 first word and ensuring that bytes not part of the string 53 are treated as nonzero. To allow for memory latency, we unroll the 54 loop a few times, being careful to ensure that we do not read ahead 55 across cache line boundaries. 56 57 Questions to answer: 58 1) How long are strings passed to strlen? If they're often really long, 59 we should probably use cache management instructions and/or unroll the 60 loop more. If they're often quite short, it might be better to use 61 fact (2) in the inner loop than have to recalculate it. 62 2) How popular are bytes with the high bit set? If they are very rare, 63 on some processors it might be useful to use the simpler expression 64 ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one 65 ALU), but this fails when any character has its high bit set. 66 67 Answer: 68 1) Added a Data Cache Block Touch early to prefetch the first 128 69 byte cache line. Adding dcbt instructions to the loop would not be 70 effective since most strings will be shorter than the cache line. */ 71 72/* Some notes on register usage: Under the SVR4 ABI, we can use registers 73 0 and 3 through 12 (so long as we don't call any procedures) without 74 saving them. We can also use registers 14 through 31 if we save them. 75 We can't use r1 (it's the stack pointer), r2 nor r13 because the user 76 program may expect them to hold their usual value if we get sent 77 a signal. Integer parameters are passed in r3 through r10. 78 We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving 79 them, the others we must save. */ 80 81/* int [r3] strlen (char *s [r3]) */ 82 83#ifndef STRLEN 84# define STRLEN strlen 85#endif 86 87ENTRY_TOCLESS (STRLEN) 88 CALL_MCOUNT 1 89 90#define rTMP4 r0 91#define rRTN r3 /* incoming STR arg, outgoing result */ 92#define rSTR r4 /* current string position */ 93#define rPADN r5 /* number of padding bits we prepend to the 94 string to make it start at a word boundary */ 95#define rFEFE r6 /* constant 0xfefefefefefefeff (-0x0101010101010101) */ 96#define r7F7F r7 /* constant 0x7f7f7f7f7f7f7f7f */ 97#define rWORD1 r8 /* current string doubleword */ 98#define rWORD2 r9 /* next string doubleword */ 99#define rMASK r9 /* mask for first string doubleword */ 100#define rTMP1 r10 101#define rTMP2 r11 102#define rTMP3 r12 103 104 dcbt 0,rRTN 105 clrrdi rSTR, rRTN, 3 106 lis r7F7F, 0x7f7f 107 rlwinm rPADN, rRTN, 3, 26, 28 108 ld rWORD1, 0(rSTR) 109 addi r7F7F, r7F7F, 0x7f7f 110 li rMASK, -1 111 insrdi r7F7F, r7F7F, 32, 0 112/* We use method (2) on the first two doublewords, because rFEFE isn't 113 required which reduces setup overhead. Also gives a faster return 114 for small strings on big-endian due to needing to recalculate with 115 method (2) anyway. */ 116#ifdef __LITTLE_ENDIAN__ 117 sld rMASK, rMASK, rPADN 118#else 119 srd rMASK, rMASK, rPADN 120#endif 121 and rTMP1, r7F7F, rWORD1 122 or rTMP2, r7F7F, rWORD1 123 lis rFEFE, -0x101 124 add rTMP1, rTMP1, r7F7F 125 addi rFEFE, rFEFE, -0x101 126 nor rTMP3, rTMP2, rTMP1 127 and. rTMP3, rTMP3, rMASK 128 mtcrf 0x01, rRTN 129 bne L(done0) 130 sldi rTMP1, rFEFE, 32 131 add rFEFE, rFEFE, rTMP1 132/* Are we now aligned to a doubleword boundary? */ 133 bt 28, L(loop) 134 135/* Handle second doubleword of pair. */ 136/* Perhaps use method (1) here for little-endian, saving one instruction? */ 137 ldu rWORD1, 8(rSTR) 138 and rTMP1, r7F7F, rWORD1 139 or rTMP2, r7F7F, rWORD1 140 add rTMP1, rTMP1, r7F7F 141 nor. rTMP3, rTMP2, rTMP1 142 bne L(done0) 143 144/* The loop. */ 145 146L(loop): 147 ld rWORD1, 8(rSTR) 148 ldu rWORD2, 16(rSTR) 149 add rTMP1, rFEFE, rWORD1 150 nor rTMP2, r7F7F, rWORD1 151 and. rTMP1, rTMP1, rTMP2 152 add rTMP3, rFEFE, rWORD2 153 nor rTMP4, r7F7F, rWORD2 154 bne L(done1) 155 and. rTMP3, rTMP3, rTMP4 156 beq L(loop) 157 158#ifndef __LITTLE_ENDIAN__ 159 and rTMP1, r7F7F, rWORD2 160 add rTMP1, rTMP1, r7F7F 161 andc rTMP3, rTMP4, rTMP1 162 b L(done0) 163 164L(done1): 165 and rTMP1, r7F7F, rWORD1 166 subi rSTR, rSTR, 8 167 add rTMP1, rTMP1, r7F7F 168 andc rTMP3, rTMP2, rTMP1 169 170/* When we get to here, rSTR points to the first doubleword in the string that 171 contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, and 0x00 172 otherwise. */ 173L(done0): 174 cntlzd rTMP3, rTMP3 175 subf rTMP1, rRTN, rSTR 176 srdi rTMP3, rTMP3, 3 177 add rRTN, rTMP1, rTMP3 178 blr 179#else 180 181L(done0): 182 addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */ 183 andc rTMP1, rTMP1, rTMP3 184 cntlzd rTMP1, rTMP1 /* Count bits not in the mask. */ 185 subf rTMP3, rRTN, rSTR 186 subfic rTMP1, rTMP1, 64-7 187 srdi rTMP1, rTMP1, 3 188 add rRTN, rTMP1, rTMP3 189 blr 190 191L(done1): 192 addi rTMP3, rTMP1, -1 193 andc rTMP3, rTMP3, rTMP1 194 cntlzd rTMP3, rTMP3 195 subf rTMP1, rRTN, rSTR 196 subfic rTMP3, rTMP3, 64-7-64 197 sradi rTMP3, rTMP3, 3 198 add rRTN, rTMP1, rTMP3 199 blr 200#endif 201 202END (STRLEN) 203libc_hidden_builtin_def (strlen) 204