/* Optimized memcmp implementation for PowerPC64.
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */

#ifndef MEMCMP
# define MEMCMP memcmp
#endif

#ifndef __LITTLE_ENDIAN__
	.machine power4
#else
/* Little endian is only available since POWER8, so it's safe to
   specify .machine as power8 (or older), even though this is a POWER4
   file.  Since the little-endian code uses 'ldbrx', power7 is enough.  */
	.machine power7
#endif
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	r0, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	r0, r0, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 12 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	std	rWORD8, -8(r1)
	std	rWORD7, -16(r1)
	cfi_offset(rWORD8, -8)
	cfi_offset(rWORD7, -16)
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
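/* Editorial note, not from the original source: a rough C sketch of the
   first-DW handling set up above.  With off = s1 & 7 (held in r12),
   both pointers have been rounded down to a DW boundary and rN has been
   increased by off, so the stray bytes preceding the real start are
   discarded by shifting each loaded DW left by 8 * off bits (that bit
   count is kept in rWORD6):

     uint64_t w1 = load_dw (s1) << (8 * off);
     uint64_t w2 = load_dw (s2) << (8 * off);
     // An unsigned compare of w1/w2 now involves only bytes of interest.

   Here load_dw () is a hypothetical stand-in for the ld/ldbrx sequences
   issued below, returning the DW with its first-in-memory byte as the
   most significant byte.  */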
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder */
	srdi	r0, rN, 5	/* Divide by 32 */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
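/* Editorial note, not from the original source: on both endiannesses
   the DW compares below are unsigned (cmpld) on values whose
   first-in-memory byte is the most significant one (plain ld on
   big-endian, byte-reversed ldbrx on little-endian), so the first
   differing byte decides the result exactly as memcmp requires.  A
   rough C equivalent of one DW step, using the same hypothetical
   load_dw () helper as in the earlier sketch:

     uint64_t w1 = load_dw (s1), w2 = load_dw (s2);
     if (w1 != w2)
       return w1 > w2 ? 1 : -1;
*/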
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
#endif
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 0(rSTR1)
	ld	rWORD4, 0(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	bne-	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
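/* Editorial note, not from the original source: a rough C equivalent of
   L(d00) below.  rN now holds 64 - 8 * remaining_bytes, so shifting
   both DWs right by rN discards the bytes beyond the compare length
   before the final unsigned compare (load_dw () is the same
   hypothetical stand-in used in the earlier sketches):

     uint64_t w1 = load_dw (next s1 DW) >> rN;
     uint64_t w2 = load_dw (next s2 DW) >> rN;
     if (w1 != w2)
       return w1 > w2 ? 1 : -1;
     return 0;
*/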
L(d00):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	li	rRTN, 0
	blr

	.align	4
L(dLcr7):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr7x):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr1x):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr6x):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr5x):
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
#if 0
/* Huh?  We've already branched on cr6!  */
	beq-	cr6, L(zeroLength)
#endif

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands; the
   branches based on those compares are delayed until the next
   iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the CTR) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop will exit early (before all the pending bytes have
   been tested), and we must complete the pending compares before
   returning.  */
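/* Editorial note, not from the original source: functionally the byte
   loop above is just the obvious C loop below; the unrolling and the
   deferred branches only exist to hide the load/compare/branch latency
   described before the loop.

     while (n--)
       {
	 int d = (unsigned char) *s1++ - (unsigned char) *s2++;
	 if (d != 0)
	   return d;
       }
     return 0;
*/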
L(b1i):
	bne-	cr7, L(bLcr7)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr7, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is already double word
   aligned and we can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not yet DW aligned.
   So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
#define rSHL		r29	/* Unaligned shift left count.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
L(unaligned):
	std	rSHL, -24(r1)
	cfi_offset(rSHL, -24)
	clrldi	rSHL, rSTR2, 61
	beq-	cr6, L(duzeroLength)
	std	rSHR, -32(r1)
	cfi_offset(rSHR, -32)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, -40(r1)
	cfi_offset(rWORD8_SHIFT, -40)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before the one that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, -48(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, -56(r1)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, -64(r1)
	cfi_offset(rWORD2_SHIFT, -48)
	cfi_offset(rWORD4_SHIFT, -56)
	cfi_offset(rWORD6_SHIFT, -64)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
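/* Editorial note, not from the original source: the idea used
   throughout the unaligned path is that, once rSTR2 has been rounded
   down to a DW boundary, an "aligned" DW worth of s2 data is
   reconstructed from two adjacent loads.  rSHL is eight times the byte
   misalignment of s2 relative to the DW-aligned view of s1 and
   rSHR = 64 - rSHL, so roughly (prev_dw and next_dw are hypothetical
   names for the two loads):

     uint64_t w2 = (prev_dw << rSHL) | (next_dw >> rSHR);

   The 'li rWORD8, 0' / 'blt cr5' sequence below substitutes zero for
   prev_dw when none of its bytes belong to s2, so we never touch a DW
   that could sit in an unmapped page just before the string.  */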
	li	rWORD8, 0
	blt	cr5, L(dus0)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD8, 0(rSTR2)
	addi	rSTR2, rSTR2, 8
#endif
	sld	rWORD8, rWORD8, rSHL

L(dus0):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, -40(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, -48(r1)
	srdi	r0, rN, 5	/* Divide by 32 */
	std	rWORD4_SHIFT, -56(r1)
	andi.	r12, rN, 24	/* Get the DW remainder */
	std	rWORD6_SHIFT, -64(r1)
	cfi_offset(rWORD8_SHIFT, -40)
	cfi_offset(rWORD2_SHIFT, -48)
	cfi_offset(rWORD4_SHIFT, -56)
	cfi_offset(rWORD6_SHIFT, -64)
	sldi	rSHL, rSHL, 3
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD6, 0(rSTR2)
	ldu	rWORD8, 8(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD7, 0(rSTR1)
#endif
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD5, 0(rSTR1)
#endif
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD3, 0(rSTR1)
#endif
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	srd	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD1, 0(rSTR1)
#endif
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz-	L(du24)		/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz+	L(duLoop)

L(duL4):
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
#else
	ld	rWORD1, 8(rSTR1)
#endif
	ld	rWORD8, -8(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, -16(r1)
	ld	rSHL, -24(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, -32(r1)
	ld	rWORD8_SHIFT, -40(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, -48(r1)
	ld	rWORD4_SHIFT, -56(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, -64(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(duLcr7):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dureturn29):
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, -40(r1)
L(dureturn26):
	ld	rWORD2_SHIFT, -48(r1)
L(dureturn25):
	ld	rWORD4_SHIFT, -56(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, -64(r1)
	blr
L(duzeroLength):
	li	rRTN, 0
	blr

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)