/* Optimized memcmp implementation for PowerPC32.
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */

	.machine power4
EALIGN (memcmp, 4, 0)
	CALL_MCOUNT

#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	r0, rSTR2, rSTR1
	cmplwi	cr6, rN, 0
	cmplwi	cr1, rN, 12
	clrlwi.	r0, r0, 30
	clrlwi	r12, rSTR1, 30
	cmplwi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	stwu	1, -64(r1)
	cfi_adjust_cfa_offset(64)
	stw	rWORD8, 48(r1)
	stw	rWORD7, 44(r1)
	cfi_offset(rWORD8, (48-64))
	cfi_offset(rWORD7, (44-64))
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already word
   aligned and can perform the word aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet word aligned).  So we force the string addresses to the next lower
   word boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (word aligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first word.  This ensures that the loop count is
   correct and the first word (shifted) is in the expected register pair.  */
	.align	4
L(samealignment):
	clrrwi	rSTR1, rSTR1, 2
	clrrwi	rSTR2, rSTR2, 2
	beq	cr5, L(Waligned)
	add	rN, rN, r12
	slwi	rWORD6, r12, 3
	srwi	r0, rN, 4	/* Divide by 16 */
	andi.	r12, rN, 12	/* Get the word remainder */
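/* For example, if rSTR1 ends in 0b10 (r12 == 2) and rN == 10: rN becomes
   12, rWORD6 == 16 (the bit count shifted off the front of the first
   word), r0 == 0 (no full 16-byte iterations) and the word remainder is
   12, selecting the L(dPs3) entry below.  */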
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 4 */
	.align	3
L(dsP1):
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD2, rWORD6
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 8 */
	.align	4
L(dPs2):
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD2, rWORD6
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 12 */
	.align	4
L(dPs3):
	slw	rWORD3, rWORD1, rWORD6
	slw	rWORD4, rWORD2, rWORD6
	cmplw	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	slw	rWORD1, rWORD1, rWORD6
	slw	rWORD2, rWORD2, rWORD6
	cmplw	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(Waligned):
	andi.	r12, rN, 12	/* Get the word remainder */
	srwi	r0, rN, 4	/* Divide by 16 */
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 4 */
	.align	4
L(dP1):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
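/* cr7 still holds the rN < 16 compare set above, so the blt cr7 below
   takes the early exit at L(dP1x) when fewer than 16 bytes remain.  */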
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
#endif
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
L(dP1e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	bne	cr1, L(dLcr1)
	cmplw	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
	.align	3
L(dP1x):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 8 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP2):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
L(dP2e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	slwi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 12 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP3):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 0(rSTR1)
	lwz	rWORD4, 0(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
L(dP3e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	slwi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 16, remainder is 0 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
L(dP4e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	bne-	cr1, L(dLcr1)
	cmplw	cr7, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare.  Since
   we are aligned it is safe to load the whole word, and use
   shift right to eliminate bits beyond the compare length.  */
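/* For example, with 3 bytes remaining r12 == 24 above, so rN == 8 here
   and both words are shifted right by 8 bits before the subtract,
   leaving only the 3 significant bytes.  */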
L(d00):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	sub	rRTN, rWORD1, rWORD2
	blr

	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr7):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr7x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr1):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr1x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr6):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr6x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr5):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr5x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands;
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmplw	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmplw	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr7, L(bLcr7)

	cmplw	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmplw	cr7, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmplw	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We are speculatively loading bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop exits early (before all pending bytes have been
   tested), and we must complete the pending operations before
   returning.  */
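/* When the counter expires, the fall-through path (L(b1i)) and the bdz
   targets (L(b2i), L(b3i)) finish whichever compares are still pending
   before computing the final byte difference.  */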
L(b1i):
	bne-	cr7, L(bLcr7)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr7, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is word aligned and we can
   perform the Wunaligned loop.

   Otherwise we know that rSTR1 is not yet word aligned.
   So we can force the string addresses to the next lower word
   boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (Wunaligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first W.  This ensures that the loop count is
   correct and the first W (shifted) is in the expected register pair.  */
#define rSHL		r29	/* Unaligned shift left count.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
	cfi_adjust_cfa_offset(64)
L(unaligned):
	stw	rSHL, 40(r1)
	cfi_offset(rSHL, (40-64))
	clrlwi	rSHL, rSTR2, 30
	stw	rSHR, 36(r1)
	cfi_offset(rSHR, (36-64))
	beq	cr5, L(Wunaligned)
	stw	rWORD8_SHIFT, 32(r1)
	cfi_offset(rWORD8_SHIFT, (32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 W.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the W before the W that contains
   the actual start of rSTR2.  */
	clrrwi	rSTR2, rSTR2, 2
	stw	rWORD2_SHIFT, 28(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (W aligned) start of rSTR1.  */
	clrlwi	rSHL, rWORD8_SHIFT, 30
	clrrwi	rSTR1, rSTR1, 2
	stw	rWORD4_SHIFT, 24(r1)
	slwi	rSHL, rSHL, 3
	cmplw	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	slwi	rWORD6, r12, 3
	stw	rWORD6_SHIFT, 20(r1)
	cfi_offset(rWORD2_SHIFT, (28-64))
	cfi_offset(rWORD4_SHIFT, (24-64))
	cfi_offset(rWORD6_SHIFT, (20-64))
	subfic	rSHR, rSHL, 32
	srwi	r0, rN, 4	/* Divide by 16 */
	andi.	r12, rN, 12	/* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a W where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
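/* cr5 (set above from the compare of rWORD8_SHIFT against the
   word-aligned rSTR2) is "less than" when the adjusted logical start
   falls in the W before the real start of rSTR2; in that case that W is
   not loaded and rWORD8 is left zero.  */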
	li	rWORD8, 0
	blt	cr5, L(dus0)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD8, 0(rSTR2)
	addi	rSTR2, rSTR2, 4
#endif
	slw	rWORD8, rWORD8, rSHL

L(dus0):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	srw	r12, rWORD2, rSHR
	clrlwi	rN, rN, 30
	beq	L(duPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 4 */
	.align	4
L(dusP1):
	slw	rWORD8_SHIFT, rWORD2, rSHL
	slw	rWORD7, rWORD1, rWORD6
	slw	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first word compare
   complete and a remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmplw	cr5, rWORD7, rWORD8
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 8 */
	.align	4
L(duPs2):
	slw	rWORD6_SHIFT, rWORD2, rSHL
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 12 */
	.align	4
L(duPs3):
	slw	rWORD4_SHIFT, rWORD2, rSHL
	slw	rWORD3, rWORD1, rWORD6
	slw	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	slw	rWORD2_SHIFT, rWORD2, rSHL
	slw	rWORD1, rWORD1, rWORD6
	slw	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(Wunaligned):
	stw	rWORD8_SHIFT, 32(r1)
	clrrwi	rSTR2, rSTR2, 2
	stw	rWORD2_SHIFT, 28(r1)
	srwi	r0, rN, 4	/* Divide by 16 */
	stw	rWORD4_SHIFT, 24(r1)
	andi.	r12, rN, 12	/* Get the W remainder */
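/* In the loop below each unaligned rSTR2 W is assembled from two
   aligned loads: the previous W shifted left by rSHL supplies the
   leading bytes and the newly loaded W shifted right by rSHR
   (rSHR == 32 - rSHL) supplies the rest.  */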
	stw	rWORD6_SHIFT, 20(r1)
	cfi_offset(rWORD8_SHIFT, (32-64))
	cfi_offset(rWORD2_SHIFT, (28-64))
	cfi_offset(rWORD4_SHIFT, (24-64))
	cfi_offset(rWORD6_SHIFT, (20-64))
	slwi	rSHL, rSHL, 3
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR2, rSTR2, 4
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD6, 0(rSTR2)
	lwzu	rWORD8, 4(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	subfic	rSHR, rSHL, 32
	slw	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 4 */
	.align	4
L(duP1):
	srw	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD7, 0(rSTR1)
#endif
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmplw	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first word compare
   complete and a remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmplw	cr5, rWORD7, rWORD8
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 8(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 8 */
	.align	4
L(duP2):
	srw	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD5, 0(rSTR1)
#endif
	or	rWORD6, r0, rWORD6_SHIFT
	slw	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	cmplw	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmplw	cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr6, L(duLcr6)
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 12 */
	.align	4
L(duP3):
	srw	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD3, 0(rSTR1)
#endif
	slw	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	cmplw	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	srw	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD1, 0(rSTR1)
#endif
	slw	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmplw	cr5, rWORD7, rWORD8
	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz+	L(duLoop)

L(duL4):
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare.  We use
   shift right to eliminate bits beyond the compare length.
   This allows the use of word subtract to compute the final result.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or
   equal to rSHR we do not need to load rWORD2 (all significant bits
   are already in rWORD8_SHIFT).  */
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	.align	4
L(dutrim):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
#else
	lwz	rWORD1, 4(rSTR1)
#endif
	lwz	rWORD8, 48(r1)
	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	lwz	rWORD7, 44(r1)
	lwz	rSHL, 40(r1)
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	lwz	rSHR, 36(r1)
	lwz	rWORD8_SHIFT, 32(r1)
	sub	rRTN, rWORD1, rWORD2
	b	L(dureturn26)
	.align	4
L(duLcr7):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
L(dureturn29):
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
L(dureturn27):
	lwz	rWORD8_SHIFT, 32(r1)
L(dureturn26):
	lwz	rWORD2_SHIFT, 28(r1)
L(dureturn25):
	lwz	rWORD4_SHIFT, 24(r1)
	lwz	rWORD6_SHIFT, 20(r1)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	blr
END (memcmp)

libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)