/* Optimized strcpy implementation for PowerPC64/POWER9.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Pick the symbol to define: an explicit STPCPY/STRCPY override wins,
   otherwise fall back to the default name for the selected variant.  */
#if defined USE_AS_STPCPY && defined STPCPY
# define FUNC_NAME STPCPY
#elif defined USE_AS_STPCPY
# define FUNC_NAME __stpcpy
#elif defined STRCPY
# define FUNC_NAME STRCPY
#else
# define FUNC_NAME strcpy
#endif

/* Implements the function

   char * [r3] strcpy (char *dest [r3], const char *src [r4])

   or

   char * [r3] stpcpy (char *dest [r3], const char *src [r4])

   if USE_AS_STPCPY is defined.

   Loads may read bytes located after the null terminator, but never
   beyond the next 16B boundary, so a load never crosses a page.

   Register roles:
     r3      dest on entry and the return value.
     r4      src.
     r5      16B-aligned source address read by L(loop).
     r8      byte count of the first chunk; afterwards the offset of
	     the null byte inside the quadword that contains it.
     r9      r8 + 1, i.e. the byte count including the null (moved to
	     the top 8 bits in the tail paths).
     r10     stxvl length operand for the first chunk.
     r11     destination address that corresponds to r5.
     v0-v5   source quadwords.
     v6      result of the most recent null-byte comparison.
     v18     zero in every byte.
     v19     0xFF in every byte.  */

/* Load the quadword at addr+offset into vreg, compare every byte with
   zero (result in v6) and branch to L(label) when at least one byte
   is null.  */
#define CHECK16(vreg,offset,addr,label)	\
	lxv	vreg+32,offset(addr);	\
	vcmpequb. v6,vreg,v18;		\
	bne	cr6,L(label);

/* Store the whole quadword held in vreg at addr+offset.  */
#define STORE16(vreg,offset,addr)	\
	stxv	vreg+32,offset(addr);

/* From the comparison result in v6 compute
     r8     = number of bytes that precede the null byte,
     r9     = r8 + 1, the byte count including the null byte,
     lenreg = r9 moved to the top 8 bits, where stxvl expects the
	      length.  */
#define TERM_LEN(lenreg)		\
	vctzlsbb r8,v6;			\
	addi	r9,r8,1;		\
	sldi	lenreg,r9,56;

/* Return to the caller.  base is the address at which the final
   partial quadword was stored and r8 the offset of the null byte
   within it.  */
#ifdef USE_AS_STPCPY
/* stpcpy returns the dest address plus the size not counting the
   final '\0', i.e. base + r8.  */
# define STRCPY_RETURN(base)		\
	add	r3,base,r8;		\
	blr
#else
/* strcpy returns dest, which has been left untouched in r3.  */
# define STRCPY_RETURN(base)		\
	blr
#endif

	.machine power9
ENTRY_TOCLESS (FUNC_NAME, 4)
	CALL_MCOUNT 2

	vspltisb v18,0		/* v18 = 0x00 in every byte.  */
	vspltisb v19,-1		/* v19 = 0xFF in every byte.  */

	/* r5  = first 16B boundary above src (src < r5 <= src + 16),
	   r8  = r5 - src, the size of the first chunk (1..16),
	   r11 = dest + r8.
	   r5 and r11 are the addresses L(loop) starts from.  */
	addi	r5,r4,16
	clrrdi	r5,r5,4
	subf	r8,r4,r5
	add	r11,r3,r8

	/* Align the first chunk of data and pad the bytes that were not
	   loaded with 0xFF (from v19), a value that cannot match the
	   null test.  */
	lvx	v0,0,r4
	lvsr	v1,0,r4
	vperm	v0,v19,v0,v1

	vcmpequb. v6,v0,v18	/* 0xff where the byte is null, else 0x00.  */
	beq	cr6,L(no_null)	/* No null byte in the first chunk.  */

	/* The string ends inside the first chunk: store the bytes up to
	   and including the null byte and return.  */
	TERM_LEN(r10)
	stxvl	32+v0,r3,r10	/* Partial store.  */
	STRCPY_RETURN(r3)

L(no_null):
	/* Store the r8 bytes of the first chunk; from here on the source
	   address in r5 is 16B aligned.  */
	sldi	r10,r8,56	/* stxvl wants the size in the top 8 bits.  */
	stxvl	32+v0,r3,r10	/* Partial store.  */

	/* Main loop: examine six quadwords (96 bytes) and store them only
	   once none of them turned out to contain a null byte.  The
	   first quadword with a null byte diverts to the tail path that
	   stores the quadwords loaded so far.  */
	.p2align 4
L(loop):
	CHECK16(v0,0,r5,tail1)
	CHECK16(v1,16,r5,tail2)
	CHECK16(v2,32,r5,tail3)
	CHECK16(v3,48,r5,tail4)
	CHECK16(v4,64,r5,tail5)
	CHECK16(v5,80,r5,tail6)

	STORE16(v0,0,r11)
	STORE16(v1,16,r11)
	STORE16(v2,32,r11)
	STORE16(v3,48,r11)
	STORE16(v4,64,r11)
	STORE16(v5,80,r11)

	addi	r5,r5,96
	addi	r11,r11,96

	b	L(loop)

	/* Tail paths.  L(tailN) is entered when the Nth quadword of the
	   current iteration holds the null byte: the N-1 quadwords in
	   front of it are stored whole, then r11 is advanced to the
	   address of the Nth one, which is stored only up to and
	   including the null byte.  */

	.p2align 4
L(tail1):
	TERM_LEN(r9)
	stxvl	32+v0,r11,r9	/* Partial store.  */
	STRCPY_RETURN(r11)

	.p2align 4
L(tail2):
	STORE16(v0,0,r11)
	TERM_LEN(r9)
	addi	r11,r11,16
	stxvl	32+v1,r11,r9
	STRCPY_RETURN(r11)

	.p2align 4
L(tail3):
	STORE16(v0,0,r11)
	STORE16(v1,16,r11)
	TERM_LEN(r9)
	addi	r11,r11,32
	stxvl	32+v2,r11,r9
	STRCPY_RETURN(r11)

	.p2align 4
L(tail4):
	STORE16(v0,0,r11)
	STORE16(v1,16,r11)
	STORE16(v2,32,r11)
	TERM_LEN(r9)
	addi	r11,r11,48
	stxvl	32+v3,r11,r9
	STRCPY_RETURN(r11)

	.p2align 4
L(tail5):
	STORE16(v0,0,r11)
	STORE16(v1,16,r11)
	STORE16(v2,32,r11)
	STORE16(v3,48,r11)
	TERM_LEN(r9)
	addi	r11,r11,64
	stxvl	32+v4,r11,r9
	STRCPY_RETURN(r11)

	.p2align 4
L(tail6):
	STORE16(v0,0,r11)
	STORE16(v1,16,r11)
	STORE16(v2,32,r11)
	STORE16(v3,48,r11)
	STORE16(v4,64,r11)
	TERM_LEN(r9)
	addi	r11,r11,80
	stxvl	32+v5,r11,r9
	STRCPY_RETURN(r11)

END (FUNC_NAME)
#ifndef USE_AS_STPCPY
libc_hidden_builtin_def (strcpy)
#endif