/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
   Copyright (C) 2015-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Select the symbol name to define.  If STPNCPY (respectively STRNCPY)
   is already defined when this file is assembled, its value is used as
   the symbol name; otherwise the default __stpncpy (respectively
   strncpy) is used.  */
#ifdef USE_AS_STPNCPY
# ifndef STPNCPY
#  define FUNC_NAME __stpncpy
# else
#  define FUNC_NAME STPNCPY
# endif
#else
# ifndef STRNCPY
#  define FUNC_NAME strncpy
# else
#  define FUNC_NAME STRNCPY
# endif
#endif  /* !USE_AS_STPNCPY  */

#ifndef MEMSET
/* For builds without IFUNC support, local calls should be made to internal
   GLIBC symbol (created by libc_hidden_builtin_def).  MEMSET_is_local also
   selects ENTRY_TOCLESS and drops the nop after the call further down.  */
# ifdef SHARED
#  define MEMSET_is_local
#  define MEMSET   __GI_memset
# else
#  define MEMSET   memset
# endif
#endif

/* Size of the frame created around the call to MEMSET.  The extra 48
   bytes cover the six doublewords (r26-r31) that the entry code stores
   at -48(r1) .. -8(r1) before any frame exists.  FRAME_MIN_SIZE is not
   defined in this file; presumably it comes from sysdep.h.  */
#define FRAMESIZE (FRAME_MIN_SIZE+48)

/* Implements the function

   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])

   or

   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])

   if USE_AS_STPNCPY is defined.

   The implementation uses unaligned doubleword access to avoid specialized
   code paths depending on data alignment.  Although recent powerpc64 uses
   64K as default, the page cross handling assumes minimum page size of
   4k.

   Overview of the code paths:
     - L(unaligned_lt_16): [src]+16 stays in the same 4K page; the first
       16 bytes are checked and copied with two doubleword accesses.
     - L(pagecross): [src]+16 is in another 4K page; aligned doubleword
       loads are checked for null bytes before the unaligned 16-byte copy
       is performed.
     - L(align_to_16b) / L(loop_16b) / L(loop_start): bulk copy from a
       16-byte aligned source address.
     - L(short_path*): byte-by-byte copy used for the tail and whenever a
       null byte has been detected by cmpb.
     - L(zero_pad_start*): pads the remainder of dest by calling MEMSET.

   Register roles in the byte-by-byte path: r4 is the source pointer, r9
   the destination pointer and r5 the number of bytes still to be written.
   r3 keeps dest throughout for strncpy; for stpncpy it is set just before
   returning.  */

	.machine  power8
#ifdef MEMSET_is_local
ENTRY_TOCLESS (FUNC_NAME, 4)
#else
ENTRY (FUNC_NAME, 4)
#endif
	CALL_MCOUNT 3

	/* Check whether [src]+16 lies in a different 4K page than [src] by
	   checking if the bit indicating the page size changes.  Basically:

	   uint64_t srcin = (uint64_t)src;
	   uint64_t ob = srcin & 4096UL;
	   uint64_t nb = (srcin+16UL) & 4096UL;
	   if (ob ^ nb)
	     goto pagecross;

	   (The branch to pagecross is additionally conditional on n, see
	   below.)  The stores of r26-r31 are interleaved with this
	   computation.  r10 = src+16 is kept and consumed again by
	   L(align_to_16b).  */

	addi	r10,r4,16
	rlwinm	r9,r4,0,19,19

	/* Save some non-volatile registers on the stack.  They are stored at
	   negative offsets from r1 without adjusting r1; a frame is created
	   only if MEMSET has to be called.  */
	std	r26,-48(r1)
	std	r27,-40(r1)

	rlwinm	r8,r10,0,19,19

	std	r28,-32(r1)
	std	r29,-24(r1)

	cmpld	cr7,r9,r8

	std	r30,-16(r1)
	std	r31,-8(r1)

	/* Update CFI.  */
	cfi_offset(r26, -48)
	cfi_offset(r27, -40)
	cfi_offset(r28, -32)
	cfi_offset(r29, -24)
	cfi_offset(r30, -16)
	cfi_offset(r31, -8)

	beq	cr7,L(unaligned_lt_16)
	/* [src]+16 is in another 4K page.  r9 = src & 7 and r8 = 8 - r9.
	   Take the page-cross path only if n > r8 (unsigned); otherwise
	   fall through to the byte-by-byte path.  */
	rldicl	r9,r4,0,61
	subfic	r8,r9,8
	cmpld	cr7,r5,r8
	bgt	cr7,L(pagecross)

	/* Byte-by-byte path.  Since it can be entered either from the checks
	   of the first 16 bytes or from the bulk copy, the code uses an
	   unrolled byte read/write instead of trying to analyze the cmpb
	   results.  L(short_path) starts writing at dest (r3);
	   L(short_path_1) and L(short_path_2) expect r9 to be set already,
	   and L(short_path_2) additionally skips the check for r5 == 0.  */
L(short_path):
	mr	r9,r3
L(short_path_1):
	/* Return if there are no more bytes to be written.  */
	cmpdi	cr7,r5,0
	beq	cr7,L(short_path_loop_end_1)
L(short_path_2):
	/* Copy one char from src (r4) and write it to dest (r9).  If it is the
	   end-of-string, start the null padding.  Continue, otherwise.  */
	lbz	r10,0(r4)
	cmpdi	cr7,r10,0
	stb	r10,0(r9)
	beq	cr7,L(zero_pad_start_1)
	/* If there are no more bytes to be written, return.  r8 = r9 + 1 and
	   r6 = r5 - 1 are prepared for L(zero_pad_start_prepare_1).  */
	cmpdi	cr0,r5,1
	addi	r8,r9,1
	addi	r6,r5,-1
	beq	cr0,L(short_path_loop_end_0)
	/* Copy another char from src (r4) to dest (r9).  Check again if it is
	   the end-of-string.  If so, start the null padding.  */
	lbz	r10,1(r4)
	cmpdi	cr7,r10,0
	stb	r10,1(r9)
	beq	cr7,L(zero_pad_start_prepare_1)
	/* Eagerly decrement r5 by 3, which is the number of bytes already
	   written, plus one write that will be performed later on.  The
	   result is kept in r10; r5 itself is updated in
	   L(short_path_loop_1).  */
	addi	r10,r5,-3
	b	L(short_path_loop_1)

	.align	4
L(short_path_loop):
	/* At this point, the induction variable, r5, as well as the pointers
	   to dest and src (r9 and r4, respectively) have been updated.

	   Note: The registers r7 and r10 are induction variables derived from
	   r5.  They are used to determine if the total number of writes has
	   been reached at every other write.  On entry r10 == r5 - 1, and
	   cr7 holds the result of comparing r10 with zero.

	   Copy one char from src (r4) and write it to dest (r9).  If it is the
	   end-of-string, start the null padding.  Continue, otherwise.  */
	lbz	r8,0(r4)
	addi	r7,r10,-2
	cmpdi	cr5,r8,0
	stb	r8,0(r9)
	beq	cr5,L(zero_pad_start_1)
	beq	cr7,L(short_path_loop_end_0)
	/* Copy another char from src (r4) to dest (r9).  Check again if it is
	   the end-of-string.  If so, start the null padding.  */
	lbz	r8,1(r4)
	cmpdi	cr7,r8,0
	stb	r8,1(r9)
	beq	cr7,L(zero_pad_start)
	mr	r10,r7
L(short_path_loop_1):
	/* This block is reached after two chars have been already written to
	   dest.  Nevertheless, r5 (the induction variable), r9 (the pointer to
	   dest), and r4 (the pointer to src) have not yet been updated.

	   At this point:
	     r5 holds the count of bytes yet to be written plus 2.
	     r9 points to the last two chars that were already written to dest.
	     r4 points to the last two chars that were already copied from src.

	   The algorithm continues by decrementing r5, the induction variable,
	   so that it reflects the last two writes.  The pointers to dest (r9)
	   and to src (r4) are incremented by two, for the same reason.

	   Note: Register r10 is another induction variable, derived from r5,
	   which determines if the total number of writes has been reached.
	   r6 = r9 + 1 is prepared for L(zero_pad_start).  */
	addic.	r5,r5,-2
	addi	r9,r9,2
	cmpdi	cr7,r10,0	/* Eagerly check if the next write is the last.  */
	addi	r4,r4,2
	addi	r6,r9,1
	bne	cr0,L(short_path_loop)	/* Check if the total number of writes
					   has been reached at every other
					   write.  */
	/* All bytes were written without finding a null byte.  For stpncpy
	   the return value is set explicitly on each exit; for strncpy the
	   labels below contain no code and execution falls through to
	   L(short_path_loop_end) with r3 still holding dest.  */
#ifdef USE_AS_STPNCPY
	mr	r3,r9
	b	L(short_path_loop_end)
#endif

L(short_path_loop_end_0):
	/* The byte at 0(r9) was the last one to be written.  */
#ifdef USE_AS_STPNCPY
	addi	r3,r9,1
	b	L(short_path_loop_end)
#endif
L(short_path_loop_end_1):
	/* Nothing was left to write; r9 is one past the last byte written.  */
#ifdef USE_AS_STPNCPY
	mr	r3,r9
#endif
L(short_path_loop_end):
	/* Restore non-volatile registers.  No frame was created on this
	   path, so the offsets are the ones used at entry.  */
	ld	r26,-48(r1)
	ld	r27,-40(r1)
	ld	r28,-32(r1)
	ld	r29,-24(r1)
	ld	r30,-16(r1)
	ld	r31,-8(r1)
	blr

	/* This code pads the remainder of dest with null bytes.  The algorithm
	   calculates the remaining size and calls memset.  */
	.align	4
L(zero_pad_start):
	/* Entered from L(short_path_loop) when the second byte of a pair is
	   the null byte: r10 == r5 - 1 and r6 == r9 + 1.  */
	mr	r5,r10
	mr	r9,r6
L(zero_pad_start_1):
	/* At this point:
	     - r5 holds the number of bytes that still have to be written to
	       dest.
	     - r9 points to the position, in dest, where the first null byte
	       will be written.
	   The above statements are true both when control reaches this label
	   from a branch or when falling through the previous lines.  */
#ifndef USE_AS_STPNCPY
	mr	r30,r3       /* Save the return value of strncpy.  */
#endif
	/* Prepare the call to memset.  */
	mr	r3,r9        /* Pointer to the area to be zero-filled.  */
	li	r4,0         /* Byte to be written (zero).  */

	/* We delayed the creation of the stack frame, as well as the saving of
	   the link register, because only at this point, we are sure that
	   doing so is actually needed.  */

	/* Save the link register.  */
	mflr	r0
	std	r0,16(r1)

	/* Create the stack frame.  */
	stdu	r1,-FRAMESIZE(r1)
	cfi_adjust_cfa_offset(FRAMESIZE)
	cfi_offset(lr, 16)

	bl	MEMSET
#ifndef MEMSET_is_local
	/* Slot following a non-local call.  NOTE(review): presumably the
	   place where the linker inserts the TOC restore; confirm against
	   the ABI documentation.  */
	nop
#endif

	ld	r0,FRAMESIZE+16(r1)

#ifndef USE_AS_STPNCPY
	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
				dest.  For stpncpy, the return value is the
				same as return value of memset.  */
#endif

	/* Restore non-volatile registers and return.  The values saved at
	   entry are now FRAMESIZE bytes above the offsets used there.  */
	ld	r26,FRAMESIZE-48(r1)
	ld	r27,FRAMESIZE-40(r1)
	ld	r28,FRAMESIZE-32(r1)
	ld	r29,FRAMESIZE-24(r1)
	ld	r30,FRAMESIZE-16(r1)
	ld	r31,FRAMESIZE-8(r1)
	/* Restore the stack frame.  */
	addi	r1,r1,FRAMESIZE
	cfi_adjust_cfa_offset(-FRAMESIZE)
	/* Restore the link register.  */
	mtlr	r0
	cfi_restore(lr)
	blr

	/* The common case where [src]+16 will not cross a 4K page boundary.
	   In this case the code quickly checks the first 16 bytes by using
	   doubleword reads/compares and stores them to the destination if
	   neither the total size is reached nor a null byte is found in the
	   loaded data.  */
	.align	4
L(unaligned_lt_16):
	/* Fewer than 8 bytes to write: use the byte-by-byte path.  */
	cmpldi	cr7,r5,7
	ble	cr7,L(short_path)
	/* First doubleword.  cmpb against zero yields a nonzero result if
	   any byte of the doubleword is zero.  */
	ld	r7,0(r4)
	li	r8,0
	cmpb	r8,r7,r8
	cmpdi	cr7,r8,0
	bne	cr7,L(short_path_prepare_2)
	/* No null byte: store it.  r6 = n - 8, r9 = dest + 8 and
	   r7 = src + 8 describe what remains.  */
	addi	r6,r5,-8
	std	r7,0(r3)
	addi	r9,r3,8
	cmpldi	cr7,r6,7
	addi	r7,r4,8
	ble	cr7,L(short_path_prepare_1_1)
	/* Second doubleword.  r4 is reused to hold the loaded data from here
	   on; the source address is still available as r7 (src + 8) and r10
	   (src + 16).  r8 is zero because the first cmpb found no null.  */
	ld	r4,8(r4)
	cmpb	r8,r4,r8
	cmpdi	cr7,r8,0
	bne	cr7,L(short_path_prepare_2_1)
	std	r4,8(r3)
	addi	r29,r3,16
	addi	r5,r5,-16
	/* Neither a null byte was found nor the total length was reached,
	   align to 16 bytes and issue a bulk copy/compare.  */
	b	L(align_to_16b)

	/* In the case of 4k page boundary cross, the algorithm first aligns
	   the address, calculates a mask based on the alignment to ignore the
	   bytes that precede the string, and continues using doublewords.
	   On entry r9 = src & 7 and r8 = 8 - r9 (computed by the entry
	   code).  Every early exit goes to L(short_path_prepare_2), which
	   restarts the copy byte by byte from the original src, dest and n.  */
	.align	4
L(pagecross):
	rldicr	r11,r4,0,59	/* Round the address down.  A mask end of 59
				   clears the low four bits, the same idiom
				   L(align_to_16b) uses for 16 bytes.
				   NOTE(review): this comment used to say
				   "8 bytes boundary" and the mask below is
				   derived from src & 7; confirm the
				   intended alignment.  */
	li	r6,-1		/* MASK = 0xffffffffffffffffUL.  */
	sldi	r9,r9,3		/* Calculate padding.  */
	ld	r7,0(r11)	/* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
	sld	r9,r6,r9	/* MASK = MASK << padding.  */
#else
	srd	r9,r6,r9	/* MASK = MASK >> padding.  */
#endif
	orc	r9,r7,r9	/* Mask bits that are not part of the
				   string.  */
	li	r7,0
	cmpb	r9,r9,r7	/* Check for null bytes in DWORD1.  */
	cmpdi	cr7,r9,0
	bne	cr7,L(short_path_prepare_2)
	subf	r8,r8,r5	/* Adjust total length.  */
	cmpldi	cr7,r8,8	/* Check if length was reached.  */
	ble	cr7,L(short_path_prepare_2)

	/* For next checks we have aligned address, so we check for more
	   three doublewords to make sure we can read 16 unaligned bytes
	   to start the bulk copy with 16 aligned addresses.  r9 is zero at
	   each cmpb below because the preceding check found no null byte.  */
	ld	r7,8(r11)
	cmpb	r9,r7,r9
	cmpdi	cr7,r9,0
	bne	cr7,L(short_path_prepare_2)
	addi	r7,r8,-8
	cmpldi	cr7,r7,8
	ble	cr7,L(short_path_prepare_2)
	ld	r7,16(r11)
	cmpb	r9,r7,r9
	cmpdi	cr7,r9,0
	bne	cr7,L(short_path_prepare_2)
	addi	r8,r8,-16
	cmpldi	cr7,r8,8
	ble	cr7,L(short_path_prepare_2)
	ld	r8,24(r11)
	cmpb	r9,r8,r9
	cmpdi	cr7,r9,0
	bne	cr7,L(short_path_prepare_2)

	/* No null byte found in the 32 bytes read and length not reached,
	   read source again using unaligned loads and store them.  */
	ld	r9,0(r4)
	addi	r29,r3,16
	addi	r5,r5,-16
	std	r9,0(r3)
	ld	r9,8(r4)
	std	r9,8(r3)

	/* Align source to 16 bytes and adjust destination and size.
	   On entry r10 = src + 16, r29 = dest + 16 and r5 = n - 16.
	   r9 = r10 & 15 is the amount by which the source is moved back to
	   reach the aligned address r28; the count (r12) is increased and
	   the destination (r29) decreased by the same amount, so up to 15
	   bytes that were already copied are copied again.  */
L(align_to_16b):
	rldicl	r9,r10,0,60
	rldicr	r28,r10,0,59
	add	r12,r5,r9
	subf	r29,r9,r29

	/* The bulk read/compare/copy loads two doublewords, compare and merge
	   in a single register for speed.  This is an attempt to speed up the
	   null-checking process for bigger strings.  */

	cmpldi	cr7,r12,15
	ble	cr7,L(short_path_prepare_1_2)

	/* Main loop for large sizes, unrolled 2 times to get better use of
	   pipeline.  The first 16 bytes are handled here, before entering
	   the loop.  The bare base register number 28 below denotes r28,
	   which was set by L(align_to_16b).  */
	ld	r8,0(28)
	ld	r10,8(28)
	li	r9,0
	cmpb	r7,r8,r9
	cmpb	r9,r10,r9
	or.	r6,r9,r7
	bne	cr0,L(short_path_prepare_2_3)
	addi	r5,r12,-16
	addi	r4,r28,16
	std	r8,0(r29)
	std	r10,8(r29)
	cmpldi	cr7,r5,15
	addi	r9,r29,16
	ble	cr7,L(short_path_1)
	/* Set up the loop state:
	     r11/r6   source/destination pointers, advanced by 32 per pass of
		      L(loop_16b).
	     r12      remaining count, decremented by 32 in L(loop_16b).
	     r4/r9/r5 source/destination/count as expected by the byte path,
		      kept 16 bytes ahead of r11/r6/r12 at L(loop_16b).
	     r30      zero, used as cmpb operand in L(loop_16b).
	     r26/r27  48 - r4 and 48 - r9, i.e. 32 - r28 and 32 - r29; adding
		      r26 and then r28 to r4 (r27 and then r29 to r9) in
		      L(loop_start) advances the pointer by 32.  */
	mr	r11,r28
	mr	r6,r29
	li	r30,0
	subfic	r26,r4,48
	subfic	r27,r9,48

	b	L(loop_16b)

	.align	4
L(loop_start):
	/* Reached only from the end of L(loop_16b), where r7 holds a zero
	   `or.' result; r7 therefore serves as the zero operand of cmpb.
	   Checks and copies the 16 bytes at 0(r11).  */
	ld	r31,0(r11)
	ld	r10,8(r11)
	cmpb	r0,r31,r7
	cmpb	r8,r10,r7
	or.	r7,r0,r8
	addi	r5,r5,-32
	cmpldi	cr7,r5,15
	add	r4,r4,r26
	add	r9,r9,r27
	/* Null byte found: L(short_path_prepare_2_2) reloads r5, r4 and r9
	   from r12, r11 and r6, so the partial updates above are
	   discarded.  */
	bne	cr0,L(short_path_prepare_2_2)
	add	r4,r28,r4
	std	r31,0(r6)
	add	r9,r29,r9
	std	r10,8(r6)
	ble	cr7,L(short_path_1)

L(loop_16b):
	/* Checks and copies the 16 bytes at 16(r11).  At this point r4, r9
	   and r5 already describe these 16 bytes, so L(short_path_2) can be
	   entered directly when a null byte is found.  */
	ld	r10,16(r11)
	ld	r0,24(r11)
	cmpb	r8,r10,r30
	cmpb	r7,r0,r30
	or.	r7,r8,r7
	addi	r12,r12,-32
	cmpldi	cr7,r12,15
	addi	r11,r11,32
	bne	cr0,L(short_path_2)
	std	r10,16(r6)
	addi	r6,r6,32
	std	r0,-8(r6)
	bgt	cr7,L(loop_start)

	/* Fewer than 16 bytes remain: finish byte by byte.  */
	mr	r5,r12
	mr	r4,r11
	mr	r9,r6
	b	L(short_path_1)

	/* The stubs below load the registers expected by the byte-by-byte
	   path (r4 = source, r9 = destination, r5 = count) and jump to it.
	   The *_1_* stubs enter at L(short_path_1), which first checks for a
	   zero count; the *_2* stubs enter at L(short_path_2).  */
	.align	4
L(short_path_prepare_1_1):
	/* From L(unaligned_lt_16) after the first doubleword was stored;
	   r9 already holds dest + 8.  */
	mr	r5,r6
	mr	r4,r7
	b	L(short_path_1)
L(short_path_prepare_1_2):
	/* From L(align_to_16b) when at most 15 bytes remain.  */
	mr	r5,r12
	mr	r4,r28
	mr	r9,r29
	b	L(short_path_1)
L(short_path_prepare_2):
	/* Nothing was consumed yet: r4 and r5 still hold src and n.  */
	mr	r9,r3
	b	L(short_path_2)
L(short_path_prepare_2_1):
	/* From L(unaligned_lt_16), null byte in the second doubleword;
	   r9 already holds dest + 8.  */
	mr	r5,r6
	mr	r4,r7
	b	L(short_path_2)
L(short_path_prepare_2_2):
	/* From L(loop_start), null byte in the 16 bytes at 0(r11).  */
	mr	r5,r12
	mr	r4,r11
	mr	r9,r6
	b	L(short_path_2)
L(short_path_prepare_2_3):
	/* Null byte in the first aligned 16 bytes at 0(r28).  */
	mr	r5,r12
	mr	r4,r28
	mr	r9,r29
	b	L(short_path_2)
L(zero_pad_start_prepare_1):
	/* From L(short_path_2) when the second byte is the null byte:
	   r6 == r5 - 1 and r8 == r9 + 1.  */
	mr	r5,r6
	mr	r9,r8
	b	L(zero_pad_start_1)
END (FUNC_NAME)

#ifndef USE_AS_STPNCPY
libc_hidden_builtin_def (strncpy)
#endif