1/* memcpy with SSSE3 2 Copyright (C) 2010-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#if IS_IN (libc) \ 20 && (defined SHARED \ 21 || defined USE_AS_MEMMOVE \ 22 || !defined USE_MULTIARCH) 23 24# include <sysdep.h> 25# include "asm-syntax.h" 26 27# ifndef MEMCPY 28# define MEMCPY __memcpy_ssse3 29# define MEMCPY_CHK __memcpy_chk_ssse3 30# endif 31 32# define DEST PARMS 33# define SRC DEST+4 34# define LEN SRC+4 35 36# define CFI_PUSH(REG) \ 37 cfi_adjust_cfa_offset (4); \ 38 cfi_rel_offset (REG, 0) 39 40# define CFI_POP(REG) \ 41 cfi_adjust_cfa_offset (-4); \ 42 cfi_restore (REG) 43 44# define PUSH(REG) pushl REG; CFI_PUSH (REG) 45# define POP(REG) popl REG; CFI_POP (REG) 46 47# ifdef PIC 48# define PARMS 8 /* Preserve EBX. */ 49# define ENTRANCE PUSH (%ebx); 50# define RETURN_END POP (%ebx); ret 51# define RETURN RETURN_END; CFI_PUSH (%ebx) 52# define JMPTBL(I, B) I - B 53 54/* Load an entry in a jump table into EBX and branch to it. TABLE is a 55 jump table with relative offsets. INDEX is a register contains the 56 index into the jump table. SCALE is the scale of INDEX. */ 57 58# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ 59 /* We first load PC into EBX. */ \ 60 SETUP_PIC_REG(bx); \ 61 /* Get the address of the jump table. */ \ 62 addl $(TABLE - .), %ebx; \ 63 /* Get the entry and convert the relative offset to the \ 64 absolute address. */ \ 65 addl (%ebx, INDEX, SCALE), %ebx; \ 66 /* We loaded the jump table. Go. */ \ 67 _CET_NOTRACK jmp *%ebx 68# else 69 70# define PARMS 4 71# define ENTRANCE 72# define RETURN_END ret 73# define RETURN RETURN_END 74# define JMPTBL(I, B) I 75 76/* Branch to an entry in a jump table. TABLE is a jump table with 77 absolute offsets. INDEX is a register contains the index into the 78 jump table. SCALE is the scale of INDEX. 
*/ 79 80# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ 81 _CET_NOTRACK jmp *TABLE(, INDEX, SCALE) 82# endif 83 84 .section .text.ssse3,"ax",@progbits 85# ifdef SHARED 86ENTRY (MEMCPY_CHK) 87 movl 12(%esp), %eax 88 cmpl %eax, 16(%esp) 89 jb HIDDEN_JUMPTARGET (__chk_fail) 90END (MEMCPY_CHK) 91# endif 92ENTRY (MEMCPY) 93 ENTRANCE 94 movl LEN(%esp), %ecx 95 movl SRC(%esp), %eax 96 movl DEST(%esp), %edx 97 98# ifdef USE_AS_MEMMOVE 99 cmp %eax, %edx 100 jb L(copy_forward) 101 je L(fwd_write_0bytes) 102 cmp $32, %ecx 103 jae L(memmove_bwd) 104 jmp L(bk_write_less32bytes_2) 105 106 .p2align 4 107L(memmove_bwd): 108 add %ecx, %eax 109 cmp %eax, %edx 110 movl SRC(%esp), %eax 111 jb L(copy_backward) 112 113L(copy_forward): 114# endif 115 cmp $48, %ecx 116 jae L(48bytesormore) 117 118L(fwd_write_less32bytes): 119# ifndef USE_AS_MEMMOVE 120 cmp %dl, %al 121 jb L(bk_write) 122# endif 123 add %ecx, %edx 124 add %ecx, %eax 125 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 126# ifndef USE_AS_MEMMOVE 127 .p2align 4 128L(bk_write): 129 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) 130# endif 131 132 .p2align 4 133L(48bytesormore): 134# ifndef USE_AS_MEMMOVE 135 movlpd (%eax), %xmm0 136 movlpd 8(%eax), %xmm1 137 movlpd %xmm0, (%edx) 138 movlpd %xmm1, 8(%edx) 139# else 140 movdqu (%eax), %xmm0 141# endif 142 PUSH (%edi) 143 movl %edx, %edi 144 and $-16, %edx 145 add $16, %edx 146 sub %edx, %edi 147 add %edi, %ecx 148 sub %edi, %eax 149 150# ifdef SHARED_CACHE_SIZE_HALF 151 cmp $SHARED_CACHE_SIZE_HALF, %ecx 152# else 153# ifdef PIC 154 SETUP_PIC_REG(bx) 155 add $_GLOBAL_OFFSET_TABLE_, %ebx 156 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx 157# else 158 cmp __x86_shared_cache_size_half, %ecx 159# endif 160# endif 161 162 mov %eax, %edi 163 jae L(large_page) 164 and $0xf, %edi 165 jz L(shl_0) 166 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) 167 168 .p2align 4 169L(shl_0): 170# ifdef USE_AS_MEMMOVE 171 movl DEST+4(%esp), %edi 172 movdqu %xmm0, (%edi) 173# endif 174 xor %edi, %edi 175 cmp $127, %ecx 176 ja L(shl_0_gobble) 177 lea -32(%ecx), %ecx 178 179 .p2align 4 180L(shl_0_loop): 181 movdqa (%eax, %edi), %xmm0 182 movdqa 16(%eax, %edi), %xmm1 183 sub $32, %ecx 184 movdqa %xmm0, (%edx, %edi) 185 movdqa %xmm1, 16(%edx, %edi) 186 lea 32(%edi), %edi 187 jb L(shl_0_end) 188 189 movdqa (%eax, %edi), %xmm0 190 movdqa 16(%eax, %edi), %xmm1 191 sub $32, %ecx 192 movdqa %xmm0, (%edx, %edi) 193 movdqa %xmm1, 16(%edx, %edi) 194 lea 32(%edi), %edi 195 jb L(shl_0_end) 196 197 movdqa (%eax, %edi), %xmm0 198 movdqa 16(%eax, %edi), %xmm1 199 sub $32, %ecx 200 movdqa %xmm0, (%edx, %edi) 201 movdqa %xmm1, 16(%edx, %edi) 202 lea 32(%edi), %edi 203 jb L(shl_0_end) 204 205 movdqa (%eax, %edi), %xmm0 206 movdqa 16(%eax, %edi), %xmm1 207 sub $32, %ecx 208 movdqa %xmm0, (%edx, %edi) 209 movdqa %xmm1, 16(%edx, %edi) 210 lea 32(%edi), %edi 211 212L(shl_0_end): 213 lea 32(%ecx), %ecx 214 add %ecx, %edi 215 add %edi, %edx 216 add %edi, %eax 217 POP (%edi) 218 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) 219 220 CFI_PUSH (%edi) 221 222 .p2align 4 223L(shl_0_gobble): 224# ifdef DATA_CACHE_SIZE_HALF 225 cmp $DATA_CACHE_SIZE_HALF, %ecx 226# else 227# ifdef PIC 228 SETUP_PIC_REG(bx) 229 add $_GLOBAL_OFFSET_TABLE_, %ebx 230 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 231# else 232 cmp __x86_data_cache_size_half, %ecx 233# endif 234# endif 235 POP (%edi) 236 lea -128(%ecx), %ecx 237 jae L(shl_0_gobble_mem_loop) 238 239 .p2align 4 240L(shl_0_gobble_cache_loop): 241 movdqa (%eax), %xmm0 242 
movdqa 0x10(%eax), %xmm1 243 movdqa 0x20(%eax), %xmm2 244 movdqa 0x30(%eax), %xmm3 245 movdqa 0x40(%eax), %xmm4 246 movdqa 0x50(%eax), %xmm5 247 movdqa 0x60(%eax), %xmm6 248 movdqa 0x70(%eax), %xmm7 249 lea 0x80(%eax), %eax 250 sub $128, %ecx 251 movdqa %xmm0, (%edx) 252 movdqa %xmm1, 0x10(%edx) 253 movdqa %xmm2, 0x20(%edx) 254 movdqa %xmm3, 0x30(%edx) 255 movdqa %xmm4, 0x40(%edx) 256 movdqa %xmm5, 0x50(%edx) 257 movdqa %xmm6, 0x60(%edx) 258 movdqa %xmm7, 0x70(%edx) 259 lea 0x80(%edx), %edx 260 261 jae L(shl_0_gobble_cache_loop) 262 cmp $-0x40, %ecx 263 lea 0x80(%ecx), %ecx 264 jl L(shl_0_cache_less_64bytes) 265 266 movdqa (%eax), %xmm0 267 sub $0x40, %ecx 268 movdqa 0x10(%eax), %xmm1 269 movdqa %xmm0, (%edx) 270 movdqa %xmm1, 0x10(%edx) 271 movdqa 0x20(%eax), %xmm0 272 movdqa 0x30(%eax), %xmm1 273 add $0x40, %eax 274 movdqa %xmm0, 0x20(%edx) 275 movdqa %xmm1, 0x30(%edx) 276 add $0x40, %edx 277 278L(shl_0_cache_less_64bytes): 279 cmp $0x20, %ecx 280 jb L(shl_0_cache_less_32bytes) 281 movdqa (%eax), %xmm0 282 sub $0x20, %ecx 283 movdqa 0x10(%eax), %xmm1 284 add $0x20, %eax 285 movdqa %xmm0, (%edx) 286 movdqa %xmm1, 0x10(%edx) 287 add $0x20, %edx 288 289L(shl_0_cache_less_32bytes): 290 cmp $0x10, %ecx 291 jb L(shl_0_cache_less_16bytes) 292 sub $0x10, %ecx 293 movdqa (%eax), %xmm0 294 add $0x10, %eax 295 movdqa %xmm0, (%edx) 296 add $0x10, %edx 297 298L(shl_0_cache_less_16bytes): 299 add %ecx, %edx 300 add %ecx, %eax 301 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 302 303 .p2align 4 304L(shl_0_gobble_mem_loop): 305 prefetcht0 0x1c0(%eax) 306 prefetcht0 0x280(%eax) 307 prefetcht0 0x1c0(%edx) 308 309 movdqa (%eax), %xmm0 310 movdqa 0x10(%eax), %xmm1 311 movdqa 0x20(%eax), %xmm2 312 movdqa 0x30(%eax), %xmm3 313 movdqa 0x40(%eax), %xmm4 314 movdqa 0x50(%eax), %xmm5 315 movdqa 0x60(%eax), %xmm6 316 movdqa 0x70(%eax), %xmm7 317 lea 0x80(%eax), %eax 318 sub $0x80, %ecx 319 movdqa %xmm0, (%edx) 320 movdqa %xmm1, 0x10(%edx) 321 movdqa %xmm2, 0x20(%edx) 322 movdqa %xmm3, 0x30(%edx) 323 movdqa %xmm4, 0x40(%edx) 324 movdqa %xmm5, 0x50(%edx) 325 movdqa %xmm6, 0x60(%edx) 326 movdqa %xmm7, 0x70(%edx) 327 lea 0x80(%edx), %edx 328 329 jae L(shl_0_gobble_mem_loop) 330 cmp $-0x40, %ecx 331 lea 0x80(%ecx), %ecx 332 jl L(shl_0_mem_less_64bytes) 333 334 movdqa (%eax), %xmm0 335 sub $0x40, %ecx 336 movdqa 0x10(%eax), %xmm1 337 338 movdqa %xmm0, (%edx) 339 movdqa %xmm1, 0x10(%edx) 340 341 movdqa 0x20(%eax), %xmm0 342 movdqa 0x30(%eax), %xmm1 343 add $0x40, %eax 344 345 movdqa %xmm0, 0x20(%edx) 346 movdqa %xmm1, 0x30(%edx) 347 add $0x40, %edx 348 349L(shl_0_mem_less_64bytes): 350 cmp $0x20, %ecx 351 jb L(shl_0_mem_less_32bytes) 352 movdqa (%eax), %xmm0 353 sub $0x20, %ecx 354 movdqa 0x10(%eax), %xmm1 355 add $0x20, %eax 356 movdqa %xmm0, (%edx) 357 movdqa %xmm1, 0x10(%edx) 358 add $0x20, %edx 359 360L(shl_0_mem_less_32bytes): 361 cmp $0x10, %ecx 362 jb L(shl_0_mem_less_16bytes) 363 sub $0x10, %ecx 364 movdqa (%eax), %xmm0 365 add $0x10, %eax 366 movdqa %xmm0, (%edx) 367 add $0x10, %edx 368 369L(shl_0_mem_less_16bytes): 370 add %ecx, %edx 371 add %ecx, %eax 372 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) 373 374 .p2align 4 375L(shl_1): 376# ifndef USE_AS_MEMMOVE 377 movaps -1(%eax), %xmm1 378# else 379 movl DEST+4(%esp), %edi 380 movaps -1(%eax), %xmm1 381 movdqu %xmm0, (%edi) 382# endif 383# ifdef DATA_CACHE_SIZE_HALF 384 cmp $DATA_CACHE_SIZE_HALF, %ecx 385# else 386# ifdef PIC 387 SETUP_PIC_REG(bx) 388 add $_GLOBAL_OFFSET_TABLE_, %ebx 389 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 
390# else 391 cmp __x86_data_cache_size_half, %ecx 392# endif 393# endif 394 jb L(sh_1_no_prefetch) 395 396 lea -64(%ecx), %ecx 397 398 .p2align 4 399L(Shl1LoopStart): 400 prefetcht0 0x1c0(%eax) 401 prefetcht0 0x1c0(%edx) 402 movaps 15(%eax), %xmm2 403 movaps 31(%eax), %xmm3 404 movaps 47(%eax), %xmm4 405 movaps 63(%eax), %xmm5 406 movaps %xmm5, %xmm7 407 palignr $1, %xmm4, %xmm5 408 palignr $1, %xmm3, %xmm4 409 movaps %xmm5, 48(%edx) 410 palignr $1, %xmm2, %xmm3 411 lea 64(%eax), %eax 412 palignr $1, %xmm1, %xmm2 413 movaps %xmm4, 32(%edx) 414 movaps %xmm3, 16(%edx) 415 movaps %xmm7, %xmm1 416 movaps %xmm2, (%edx) 417 lea 64(%edx), %edx 418 sub $64, %ecx 419 ja L(Shl1LoopStart) 420 421L(Shl1LoopLeave): 422 add $32, %ecx 423 jle L(shl_end_0) 424 425 movaps 15(%eax), %xmm2 426 movaps 31(%eax), %xmm3 427 palignr $1, %xmm2, %xmm3 428 palignr $1, %xmm1, %xmm2 429 movaps %xmm2, (%edx) 430 movaps %xmm3, 16(%edx) 431 lea 32(%edx, %ecx), %edx 432 lea 32(%eax, %ecx), %eax 433 POP (%edi) 434 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 435 436 CFI_PUSH (%edi) 437 438 .p2align 4 439L(sh_1_no_prefetch): 440 lea -32(%ecx), %ecx 441 lea -1(%eax), %eax 442 xor %edi, %edi 443 444 .p2align 4 445L(sh_1_no_prefetch_loop): 446 movdqa 16(%eax, %edi), %xmm2 447 sub $32, %ecx 448 movdqa 32(%eax, %edi), %xmm3 449 movdqa %xmm3, %xmm4 450 palignr $1, %xmm2, %xmm3 451 palignr $1, %xmm1, %xmm2 452 lea 32(%edi), %edi 453 movdqa %xmm2, -32(%edx, %edi) 454 movdqa %xmm3, -16(%edx, %edi) 455 jb L(sh_1_end_no_prefetch_loop) 456 457 movdqa 16(%eax, %edi), %xmm2 458 sub $32, %ecx 459 movdqa 32(%eax, %edi), %xmm3 460 movdqa %xmm3, %xmm1 461 palignr $1, %xmm2, %xmm3 462 palignr $1, %xmm4, %xmm2 463 lea 32(%edi), %edi 464 movdqa %xmm2, -32(%edx, %edi) 465 movdqa %xmm3, -16(%edx, %edi) 466 jae L(sh_1_no_prefetch_loop) 467 468L(sh_1_end_no_prefetch_loop): 469 lea 32(%ecx), %ecx 470 add %ecx, %edi 471 add %edi, %edx 472 lea 1(%edi, %eax), %eax 473 POP (%edi) 474 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 475 476 CFI_PUSH (%edi) 477 478 .p2align 4 479L(shl_2): 480# ifndef USE_AS_MEMMOVE 481 movaps -2(%eax), %xmm1 482# else 483 movl DEST+4(%esp), %edi 484 movaps -2(%eax), %xmm1 485 movdqu %xmm0, (%edi) 486# endif 487# ifdef DATA_CACHE_SIZE_HALF 488 cmp $DATA_CACHE_SIZE_HALF, %ecx 489# else 490# ifdef PIC 491 SETUP_PIC_REG(bx) 492 add $_GLOBAL_OFFSET_TABLE_, %ebx 493 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 494# else 495 cmp __x86_data_cache_size_half, %ecx 496# endif 497# endif 498 jb L(sh_2_no_prefetch) 499 500 lea -64(%ecx), %ecx 501 502 .p2align 4 503L(Shl2LoopStart): 504 prefetcht0 0x1c0(%eax) 505 prefetcht0 0x1c0(%edx) 506 movaps 14(%eax), %xmm2 507 movaps 30(%eax), %xmm3 508 movaps 46(%eax), %xmm4 509 movaps 62(%eax), %xmm5 510 movaps %xmm5, %xmm7 511 palignr $2, %xmm4, %xmm5 512 palignr $2, %xmm3, %xmm4 513 movaps %xmm5, 48(%edx) 514 palignr $2, %xmm2, %xmm3 515 lea 64(%eax), %eax 516 palignr $2, %xmm1, %xmm2 517 movaps %xmm4, 32(%edx) 518 movaps %xmm3, 16(%edx) 519 movaps %xmm7, %xmm1 520 movaps %xmm2, (%edx) 521 lea 64(%edx), %edx 522 sub $64, %ecx 523 ja L(Shl2LoopStart) 524 525L(Shl2LoopLeave): 526 add $32, %ecx 527 jle L(shl_end_0) 528 529 movaps 14(%eax), %xmm2 530 movaps 30(%eax), %xmm3 531 palignr $2, %xmm2, %xmm3 532 palignr $2, %xmm1, %xmm2 533 movaps %xmm2, (%edx) 534 movaps %xmm3, 16(%edx) 535 lea 32(%edx, %ecx), %edx 536 lea 32(%eax, %ecx), %eax 537 POP (%edi) 538 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 539 540 CFI_PUSH (%edi) 541 542 .p2align 4 543L(sh_2_no_prefetch): 
544 lea -32(%ecx), %ecx 545 lea -2(%eax), %eax 546 xor %edi, %edi 547 548 .p2align 4 549L(sh_2_no_prefetch_loop): 550 movdqa 16(%eax, %edi), %xmm2 551 sub $32, %ecx 552 movdqa 32(%eax, %edi), %xmm3 553 movdqa %xmm3, %xmm4 554 palignr $2, %xmm2, %xmm3 555 palignr $2, %xmm1, %xmm2 556 lea 32(%edi), %edi 557 movdqa %xmm2, -32(%edx, %edi) 558 movdqa %xmm3, -16(%edx, %edi) 559 jb L(sh_2_end_no_prefetch_loop) 560 561 movdqa 16(%eax, %edi), %xmm2 562 sub $32, %ecx 563 movdqa 32(%eax, %edi), %xmm3 564 movdqa %xmm3, %xmm1 565 palignr $2, %xmm2, %xmm3 566 palignr $2, %xmm4, %xmm2 567 lea 32(%edi), %edi 568 movdqa %xmm2, -32(%edx, %edi) 569 movdqa %xmm3, -16(%edx, %edi) 570 jae L(sh_2_no_prefetch_loop) 571 572L(sh_2_end_no_prefetch_loop): 573 lea 32(%ecx), %ecx 574 add %ecx, %edi 575 add %edi, %edx 576 lea 2(%edi, %eax), %eax 577 POP (%edi) 578 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 579 580 CFI_PUSH (%edi) 581 582 .p2align 4 583L(shl_3): 584# ifndef USE_AS_MEMMOVE 585 movaps -3(%eax), %xmm1 586# else 587 movl DEST+4(%esp), %edi 588 movaps -3(%eax), %xmm1 589 movdqu %xmm0, (%edi) 590# endif 591# ifdef DATA_CACHE_SIZE_HALF 592 cmp $DATA_CACHE_SIZE_HALF, %ecx 593# else 594# ifdef PIC 595 SETUP_PIC_REG(bx) 596 add $_GLOBAL_OFFSET_TABLE_, %ebx 597 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 598# else 599 cmp __x86_data_cache_size_half, %ecx 600# endif 601# endif 602 jb L(sh_3_no_prefetch) 603 604 lea -64(%ecx), %ecx 605 606 .p2align 4 607L(Shl3LoopStart): 608 prefetcht0 0x1c0(%eax) 609 prefetcht0 0x1c0(%edx) 610 movaps 13(%eax), %xmm2 611 movaps 29(%eax), %xmm3 612 movaps 45(%eax), %xmm4 613 movaps 61(%eax), %xmm5 614 movaps %xmm5, %xmm7 615 palignr $3, %xmm4, %xmm5 616 palignr $3, %xmm3, %xmm4 617 movaps %xmm5, 48(%edx) 618 palignr $3, %xmm2, %xmm3 619 lea 64(%eax), %eax 620 palignr $3, %xmm1, %xmm2 621 movaps %xmm4, 32(%edx) 622 movaps %xmm3, 16(%edx) 623 movaps %xmm7, %xmm1 624 movaps %xmm2, (%edx) 625 lea 64(%edx), %edx 626 sub $64, %ecx 627 ja L(Shl3LoopStart) 628 629L(Shl3LoopLeave): 630 add $32, %ecx 631 jle L(shl_end_0) 632 633 movaps 13(%eax), %xmm2 634 movaps 29(%eax), %xmm3 635 palignr $3, %xmm2, %xmm3 636 palignr $3, %xmm1, %xmm2 637 movaps %xmm2, (%edx) 638 movaps %xmm3, 16(%edx) 639 lea 32(%edx, %ecx), %edx 640 lea 32(%eax, %ecx), %eax 641 POP (%edi) 642 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 643 644 CFI_PUSH (%edi) 645 646 .p2align 4 647L(sh_3_no_prefetch): 648 lea -32(%ecx), %ecx 649 lea -3(%eax), %eax 650 xor %edi, %edi 651 652 .p2align 4 653L(sh_3_no_prefetch_loop): 654 movdqa 16(%eax, %edi), %xmm2 655 sub $32, %ecx 656 movdqa 32(%eax, %edi), %xmm3 657 movdqa %xmm3, %xmm4 658 palignr $3, %xmm2, %xmm3 659 palignr $3, %xmm1, %xmm2 660 lea 32(%edi), %edi 661 movdqa %xmm2, -32(%edx, %edi) 662 movdqa %xmm3, -16(%edx, %edi) 663 664 jb L(sh_3_end_no_prefetch_loop) 665 666 movdqa 16(%eax, %edi), %xmm2 667 sub $32, %ecx 668 movdqa 32(%eax, %edi), %xmm3 669 movdqa %xmm3, %xmm1 670 palignr $3, %xmm2, %xmm3 671 palignr $3, %xmm4, %xmm2 672 lea 32(%edi), %edi 673 movdqa %xmm2, -32(%edx, %edi) 674 movdqa %xmm3, -16(%edx, %edi) 675 676 jae L(sh_3_no_prefetch_loop) 677 678L(sh_3_end_no_prefetch_loop): 679 lea 32(%ecx), %ecx 680 add %ecx, %edi 681 add %edi, %edx 682 lea 3(%edi, %eax), %eax 683 POP (%edi) 684 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 685 686 CFI_PUSH (%edi) 687 688 .p2align 4 689L(shl_4): 690# ifndef USE_AS_MEMMOVE 691 movaps -4(%eax), %xmm1 692# else 693 movl DEST+4(%esp), %edi 694 movaps -4(%eax), %xmm1 695 movdqu %xmm0, (%edi) 696# endif 
697# ifdef DATA_CACHE_SIZE_HALF 698 cmp $DATA_CACHE_SIZE_HALF, %ecx 699# else 700# ifdef PIC 701 SETUP_PIC_REG(bx) 702 add $_GLOBAL_OFFSET_TABLE_, %ebx 703 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 704# else 705 cmp __x86_data_cache_size_half, %ecx 706# endif 707# endif 708 jb L(sh_4_no_prefetch) 709 710 lea -64(%ecx), %ecx 711 712 .p2align 4 713L(Shl4LoopStart): 714 prefetcht0 0x1c0(%eax) 715 prefetcht0 0x1c0(%edx) 716 movaps 12(%eax), %xmm2 717 movaps 28(%eax), %xmm3 718 movaps 44(%eax), %xmm4 719 movaps 60(%eax), %xmm5 720 movaps %xmm5, %xmm7 721 palignr $4, %xmm4, %xmm5 722 palignr $4, %xmm3, %xmm4 723 movaps %xmm5, 48(%edx) 724 palignr $4, %xmm2, %xmm3 725 lea 64(%eax), %eax 726 palignr $4, %xmm1, %xmm2 727 movaps %xmm4, 32(%edx) 728 movaps %xmm3, 16(%edx) 729 movaps %xmm7, %xmm1 730 movaps %xmm2, (%edx) 731 lea 64(%edx), %edx 732 sub $64, %ecx 733 ja L(Shl4LoopStart) 734 735L(Shl4LoopLeave): 736 add $32, %ecx 737 jle L(shl_end_0) 738 739 movaps 12(%eax), %xmm2 740 movaps 28(%eax), %xmm3 741 palignr $4, %xmm2, %xmm3 742 palignr $4, %xmm1, %xmm2 743 movaps %xmm2, (%edx) 744 movaps %xmm3, 16(%edx) 745 lea 32(%edx, %ecx), %edx 746 lea 32(%eax, %ecx), %eax 747 POP (%edi) 748 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 749 750 CFI_PUSH (%edi) 751 752 .p2align 4 753L(sh_4_no_prefetch): 754 lea -32(%ecx), %ecx 755 lea -4(%eax), %eax 756 xor %edi, %edi 757 758 .p2align 4 759L(sh_4_no_prefetch_loop): 760 movdqa 16(%eax, %edi), %xmm2 761 sub $32, %ecx 762 movdqa 32(%eax, %edi), %xmm3 763 movdqa %xmm3, %xmm4 764 palignr $4, %xmm2, %xmm3 765 palignr $4, %xmm1, %xmm2 766 lea 32(%edi), %edi 767 movdqa %xmm2, -32(%edx, %edi) 768 movdqa %xmm3, -16(%edx, %edi) 769 770 jb L(sh_4_end_no_prefetch_loop) 771 772 movdqa 16(%eax, %edi), %xmm2 773 sub $32, %ecx 774 movdqa 32(%eax, %edi), %xmm3 775 movdqa %xmm3, %xmm1 776 palignr $4, %xmm2, %xmm3 777 palignr $4, %xmm4, %xmm2 778 lea 32(%edi), %edi 779 movdqa %xmm2, -32(%edx, %edi) 780 movdqa %xmm3, -16(%edx, %edi) 781 782 jae L(sh_4_no_prefetch_loop) 783 784L(sh_4_end_no_prefetch_loop): 785 lea 32(%ecx), %ecx 786 add %ecx, %edi 787 add %edi, %edx 788 lea 4(%edi, %eax), %eax 789 POP (%edi) 790 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 791 792 CFI_PUSH (%edi) 793 794 .p2align 4 795L(shl_5): 796# ifndef USE_AS_MEMMOVE 797 movaps -5(%eax), %xmm1 798# else 799 movl DEST+4(%esp), %edi 800 movaps -5(%eax), %xmm1 801 movdqu %xmm0, (%edi) 802# endif 803# ifdef DATA_CACHE_SIZE_HALF 804 cmp $DATA_CACHE_SIZE_HALF, %ecx 805# else 806# ifdef PIC 807 SETUP_PIC_REG(bx) 808 add $_GLOBAL_OFFSET_TABLE_, %ebx 809 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 810# else 811 cmp __x86_data_cache_size_half, %ecx 812# endif 813# endif 814 jb L(sh_5_no_prefetch) 815 816 lea -64(%ecx), %ecx 817 818 .p2align 4 819L(Shl5LoopStart): 820 prefetcht0 0x1c0(%eax) 821 prefetcht0 0x1c0(%edx) 822 movaps 11(%eax), %xmm2 823 movaps 27(%eax), %xmm3 824 movaps 43(%eax), %xmm4 825 movaps 59(%eax), %xmm5 826 movaps %xmm5, %xmm7 827 palignr $5, %xmm4, %xmm5 828 palignr $5, %xmm3, %xmm4 829 movaps %xmm5, 48(%edx) 830 palignr $5, %xmm2, %xmm3 831 lea 64(%eax), %eax 832 palignr $5, %xmm1, %xmm2 833 movaps %xmm4, 32(%edx) 834 movaps %xmm3, 16(%edx) 835 movaps %xmm7, %xmm1 836 movaps %xmm2, (%edx) 837 lea 64(%edx), %edx 838 sub $64, %ecx 839 ja L(Shl5LoopStart) 840 841L(Shl5LoopLeave): 842 add $32, %ecx 843 jle L(shl_end_0) 844 845 movaps 11(%eax), %xmm2 846 movaps 27(%eax), %xmm3 847 palignr $5, %xmm2, %xmm3 848 palignr $5, %xmm1, %xmm2 849 movaps %xmm2, (%edx) 850 movaps 
%xmm3, 16(%edx) 851 lea 32(%edx, %ecx), %edx 852 lea 32(%eax, %ecx), %eax 853 POP (%edi) 854 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 855 856 CFI_PUSH (%edi) 857 858 .p2align 4 859L(sh_5_no_prefetch): 860 lea -32(%ecx), %ecx 861 lea -5(%eax), %eax 862 xor %edi, %edi 863 864 .p2align 4 865L(sh_5_no_prefetch_loop): 866 movdqa 16(%eax, %edi), %xmm2 867 sub $32, %ecx 868 movdqa 32(%eax, %edi), %xmm3 869 movdqa %xmm3, %xmm4 870 palignr $5, %xmm2, %xmm3 871 palignr $5, %xmm1, %xmm2 872 lea 32(%edi), %edi 873 movdqa %xmm2, -32(%edx, %edi) 874 movdqa %xmm3, -16(%edx, %edi) 875 876 jb L(sh_5_end_no_prefetch_loop) 877 878 movdqa 16(%eax, %edi), %xmm2 879 sub $32, %ecx 880 movdqa 32(%eax, %edi), %xmm3 881 movdqa %xmm3, %xmm1 882 palignr $5, %xmm2, %xmm3 883 palignr $5, %xmm4, %xmm2 884 lea 32(%edi), %edi 885 movdqa %xmm2, -32(%edx, %edi) 886 movdqa %xmm3, -16(%edx, %edi) 887 888 jae L(sh_5_no_prefetch_loop) 889 890L(sh_5_end_no_prefetch_loop): 891 lea 32(%ecx), %ecx 892 add %ecx, %edi 893 add %edi, %edx 894 lea 5(%edi, %eax), %eax 895 POP (%edi) 896 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 897 898 CFI_PUSH (%edi) 899 900 .p2align 4 901L(shl_6): 902# ifndef USE_AS_MEMMOVE 903 movaps -6(%eax), %xmm1 904# else 905 movl DEST+4(%esp), %edi 906 movaps -6(%eax), %xmm1 907 movdqu %xmm0, (%edi) 908# endif 909# ifdef DATA_CACHE_SIZE_HALF 910 cmp $DATA_CACHE_SIZE_HALF, %ecx 911# else 912# ifdef PIC 913 SETUP_PIC_REG(bx) 914 add $_GLOBAL_OFFSET_TABLE_, %ebx 915 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 916# else 917 cmp __x86_data_cache_size_half, %ecx 918# endif 919# endif 920 jb L(sh_6_no_prefetch) 921 922 lea -64(%ecx), %ecx 923 924 .p2align 4 925L(Shl6LoopStart): 926 prefetcht0 0x1c0(%eax) 927 prefetcht0 0x1c0(%edx) 928 movaps 10(%eax), %xmm2 929 movaps 26(%eax), %xmm3 930 movaps 42(%eax), %xmm4 931 movaps 58(%eax), %xmm5 932 movaps %xmm5, %xmm7 933 palignr $6, %xmm4, %xmm5 934 palignr $6, %xmm3, %xmm4 935 movaps %xmm5, 48(%edx) 936 palignr $6, %xmm2, %xmm3 937 lea 64(%eax), %eax 938 palignr $6, %xmm1, %xmm2 939 movaps %xmm4, 32(%edx) 940 movaps %xmm3, 16(%edx) 941 movaps %xmm7, %xmm1 942 movaps %xmm2, (%edx) 943 lea 64(%edx), %edx 944 sub $64, %ecx 945 ja L(Shl6LoopStart) 946 947L(Shl6LoopLeave): 948 add $32, %ecx 949 jle L(shl_end_0) 950 951 movaps 10(%eax), %xmm2 952 movaps 26(%eax), %xmm3 953 palignr $6, %xmm2, %xmm3 954 palignr $6, %xmm1, %xmm2 955 movaps %xmm2, (%edx) 956 movaps %xmm3, 16(%edx) 957 lea 32(%edx, %ecx), %edx 958 lea 32(%eax, %ecx), %eax 959 POP (%edi) 960 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 961 962 CFI_PUSH (%edi) 963 964 .p2align 4 965L(sh_6_no_prefetch): 966 lea -32(%ecx), %ecx 967 lea -6(%eax), %eax 968 xor %edi, %edi 969 970 .p2align 4 971L(sh_6_no_prefetch_loop): 972 movdqa 16(%eax, %edi), %xmm2 973 sub $32, %ecx 974 movdqa 32(%eax, %edi), %xmm3 975 movdqa %xmm3, %xmm4 976 palignr $6, %xmm2, %xmm3 977 palignr $6, %xmm1, %xmm2 978 lea 32(%edi), %edi 979 movdqa %xmm2, -32(%edx, %edi) 980 movdqa %xmm3, -16(%edx, %edi) 981 982 jb L(sh_6_end_no_prefetch_loop) 983 984 movdqa 16(%eax, %edi), %xmm2 985 sub $32, %ecx 986 movdqa 32(%eax, %edi), %xmm3 987 movdqa %xmm3, %xmm1 988 palignr $6, %xmm2, %xmm3 989 palignr $6, %xmm4, %xmm2 990 lea 32(%edi), %edi 991 movdqa %xmm2, -32(%edx, %edi) 992 movdqa %xmm3, -16(%edx, %edi) 993 994 jae L(sh_6_no_prefetch_loop) 995 996L(sh_6_end_no_prefetch_loop): 997 lea 32(%ecx), %ecx 998 add %ecx, %edi 999 add %edi, %edx 1000 lea 6(%edi, %eax), %eax 1001 POP (%edi) 1002 
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1003 1004 CFI_PUSH (%edi) 1005 1006 .p2align 4 1007L(shl_7): 1008# ifndef USE_AS_MEMMOVE 1009 movaps -7(%eax), %xmm1 1010# else 1011 movl DEST+4(%esp), %edi 1012 movaps -7(%eax), %xmm1 1013 movdqu %xmm0, (%edi) 1014# endif 1015# ifdef DATA_CACHE_SIZE_HALF 1016 cmp $DATA_CACHE_SIZE_HALF, %ecx 1017# else 1018# ifdef PIC 1019 SETUP_PIC_REG(bx) 1020 add $_GLOBAL_OFFSET_TABLE_, %ebx 1021 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1022# else 1023 cmp __x86_data_cache_size_half, %ecx 1024# endif 1025# endif 1026 jb L(sh_7_no_prefetch) 1027 1028 lea -64(%ecx), %ecx 1029 1030 .p2align 4 1031L(Shl7LoopStart): 1032 prefetcht0 0x1c0(%eax) 1033 prefetcht0 0x1c0(%edx) 1034 movaps 9(%eax), %xmm2 1035 movaps 25(%eax), %xmm3 1036 movaps 41(%eax), %xmm4 1037 movaps 57(%eax), %xmm5 1038 movaps %xmm5, %xmm7 1039 palignr $7, %xmm4, %xmm5 1040 palignr $7, %xmm3, %xmm4 1041 movaps %xmm5, 48(%edx) 1042 palignr $7, %xmm2, %xmm3 1043 lea 64(%eax), %eax 1044 palignr $7, %xmm1, %xmm2 1045 movaps %xmm4, 32(%edx) 1046 movaps %xmm3, 16(%edx) 1047 movaps %xmm7, %xmm1 1048 movaps %xmm2, (%edx) 1049 lea 64(%edx), %edx 1050 sub $64, %ecx 1051 ja L(Shl7LoopStart) 1052 1053L(Shl7LoopLeave): 1054 add $32, %ecx 1055 jle L(shl_end_0) 1056 1057 movaps 9(%eax), %xmm2 1058 movaps 25(%eax), %xmm3 1059 palignr $7, %xmm2, %xmm3 1060 palignr $7, %xmm1, %xmm2 1061 movaps %xmm2, (%edx) 1062 movaps %xmm3, 16(%edx) 1063 lea 32(%edx, %ecx), %edx 1064 lea 32(%eax, %ecx), %eax 1065 POP (%edi) 1066 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1067 1068 CFI_PUSH (%edi) 1069 1070 .p2align 4 1071L(sh_7_no_prefetch): 1072 lea -32(%ecx), %ecx 1073 lea -7(%eax), %eax 1074 xor %edi, %edi 1075 1076 .p2align 4 1077L(sh_7_no_prefetch_loop): 1078 movdqa 16(%eax, %edi), %xmm2 1079 sub $32, %ecx 1080 movdqa 32(%eax, %edi), %xmm3 1081 movdqa %xmm3, %xmm4 1082 palignr $7, %xmm2, %xmm3 1083 palignr $7, %xmm1, %xmm2 1084 lea 32(%edi), %edi 1085 movdqa %xmm2, -32(%edx, %edi) 1086 movdqa %xmm3, -16(%edx, %edi) 1087 jb L(sh_7_end_no_prefetch_loop) 1088 1089 movdqa 16(%eax, %edi), %xmm2 1090 sub $32, %ecx 1091 movdqa 32(%eax, %edi), %xmm3 1092 movdqa %xmm3, %xmm1 1093 palignr $7, %xmm2, %xmm3 1094 palignr $7, %xmm4, %xmm2 1095 lea 32(%edi), %edi 1096 movdqa %xmm2, -32(%edx, %edi) 1097 movdqa %xmm3, -16(%edx, %edi) 1098 jae L(sh_7_no_prefetch_loop) 1099 1100L(sh_7_end_no_prefetch_loop): 1101 lea 32(%ecx), %ecx 1102 add %ecx, %edi 1103 add %edi, %edx 1104 lea 7(%edi, %eax), %eax 1105 POP (%edi) 1106 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1107 1108 CFI_PUSH (%edi) 1109 1110 .p2align 4 1111L(shl_8): 1112# ifndef USE_AS_MEMMOVE 1113 movaps -8(%eax), %xmm1 1114# else 1115 movl DEST+4(%esp), %edi 1116 movaps -8(%eax), %xmm1 1117 movdqu %xmm0, (%edi) 1118# endif 1119# ifdef DATA_CACHE_SIZE_HALF 1120 cmp $DATA_CACHE_SIZE_HALF, %ecx 1121# else 1122# ifdef PIC 1123 SETUP_PIC_REG(bx) 1124 add $_GLOBAL_OFFSET_TABLE_, %ebx 1125 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1126# else 1127 cmp __x86_data_cache_size_half, %ecx 1128# endif 1129# endif 1130 jb L(sh_8_no_prefetch) 1131 1132 lea -64(%ecx), %ecx 1133 1134 .p2align 4 1135L(Shl8LoopStart): 1136 prefetcht0 0x1c0(%eax) 1137 prefetcht0 0x1c0(%edx) 1138 movaps 8(%eax), %xmm2 1139 movaps 24(%eax), %xmm3 1140 movaps 40(%eax), %xmm4 1141 movaps 56(%eax), %xmm5 1142 movaps %xmm5, %xmm7 1143 palignr $8, %xmm4, %xmm5 1144 palignr $8, %xmm3, %xmm4 1145 movaps %xmm5, 48(%edx) 1146 palignr $8, %xmm2, %xmm3 1147 lea 64(%eax), %eax 1148 palignr 
$8, %xmm1, %xmm2 1149 movaps %xmm4, 32(%edx) 1150 movaps %xmm3, 16(%edx) 1151 movaps %xmm7, %xmm1 1152 movaps %xmm2, (%edx) 1153 lea 64(%edx), %edx 1154 sub $64, %ecx 1155 ja L(Shl8LoopStart) 1156 1157L(LoopLeave8): 1158 add $32, %ecx 1159 jle L(shl_end_0) 1160 1161 movaps 8(%eax), %xmm2 1162 movaps 24(%eax), %xmm3 1163 palignr $8, %xmm2, %xmm3 1164 palignr $8, %xmm1, %xmm2 1165 movaps %xmm2, (%edx) 1166 movaps %xmm3, 16(%edx) 1167 lea 32(%edx, %ecx), %edx 1168 lea 32(%eax, %ecx), %eax 1169 POP (%edi) 1170 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1171 1172 CFI_PUSH (%edi) 1173 1174 .p2align 4 1175L(sh_8_no_prefetch): 1176 lea -32(%ecx), %ecx 1177 lea -8(%eax), %eax 1178 xor %edi, %edi 1179 1180 .p2align 4 1181L(sh_8_no_prefetch_loop): 1182 movdqa 16(%eax, %edi), %xmm2 1183 sub $32, %ecx 1184 movdqa 32(%eax, %edi), %xmm3 1185 movdqa %xmm3, %xmm4 1186 palignr $8, %xmm2, %xmm3 1187 palignr $8, %xmm1, %xmm2 1188 lea 32(%edi), %edi 1189 movdqa %xmm2, -32(%edx, %edi) 1190 movdqa %xmm3, -16(%edx, %edi) 1191 jb L(sh_8_end_no_prefetch_loop) 1192 1193 movdqa 16(%eax, %edi), %xmm2 1194 sub $32, %ecx 1195 movdqa 32(%eax, %edi), %xmm3 1196 movdqa %xmm3, %xmm1 1197 palignr $8, %xmm2, %xmm3 1198 palignr $8, %xmm4, %xmm2 1199 lea 32(%edi), %edi 1200 movdqa %xmm2, -32(%edx, %edi) 1201 movdqa %xmm3, -16(%edx, %edi) 1202 jae L(sh_8_no_prefetch_loop) 1203 1204L(sh_8_end_no_prefetch_loop): 1205 lea 32(%ecx), %ecx 1206 add %ecx, %edi 1207 add %edi, %edx 1208 lea 8(%edi, %eax), %eax 1209 POP (%edi) 1210 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1211 1212 CFI_PUSH (%edi) 1213 1214 .p2align 4 1215L(shl_9): 1216# ifndef USE_AS_MEMMOVE 1217 movaps -9(%eax), %xmm1 1218# else 1219 movl DEST+4(%esp), %edi 1220 movaps -9(%eax), %xmm1 1221 movdqu %xmm0, (%edi) 1222# endif 1223# ifdef DATA_CACHE_SIZE_HALF 1224 cmp $DATA_CACHE_SIZE_HALF, %ecx 1225# else 1226# ifdef PIC 1227 SETUP_PIC_REG(bx) 1228 add $_GLOBAL_OFFSET_TABLE_, %ebx 1229 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1230# else 1231 cmp __x86_data_cache_size_half, %ecx 1232# endif 1233# endif 1234 jb L(sh_9_no_prefetch) 1235 1236 lea -64(%ecx), %ecx 1237 1238 .p2align 4 1239L(Shl9LoopStart): 1240 prefetcht0 0x1c0(%eax) 1241 prefetcht0 0x1c0(%edx) 1242 movaps 7(%eax), %xmm2 1243 movaps 23(%eax), %xmm3 1244 movaps 39(%eax), %xmm4 1245 movaps 55(%eax), %xmm5 1246 movaps %xmm5, %xmm7 1247 palignr $9, %xmm4, %xmm5 1248 palignr $9, %xmm3, %xmm4 1249 movaps %xmm5, 48(%edx) 1250 palignr $9, %xmm2, %xmm3 1251 lea 64(%eax), %eax 1252 palignr $9, %xmm1, %xmm2 1253 movaps %xmm4, 32(%edx) 1254 movaps %xmm3, 16(%edx) 1255 movaps %xmm7, %xmm1 1256 movaps %xmm2, (%edx) 1257 lea 64(%edx), %edx 1258 sub $64, %ecx 1259 ja L(Shl9LoopStart) 1260 1261L(Shl9LoopLeave): 1262 add $32, %ecx 1263 jle L(shl_end_0) 1264 1265 movaps 7(%eax), %xmm2 1266 movaps 23(%eax), %xmm3 1267 palignr $9, %xmm2, %xmm3 1268 palignr $9, %xmm1, %xmm2 1269 1270 movaps %xmm2, (%edx) 1271 movaps %xmm3, 16(%edx) 1272 lea 32(%edx, %ecx), %edx 1273 lea 32(%eax, %ecx), %eax 1274 POP (%edi) 1275 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1276 1277 CFI_PUSH (%edi) 1278 1279 .p2align 4 1280L(sh_9_no_prefetch): 1281 lea -32(%ecx), %ecx 1282 lea -9(%eax), %eax 1283 xor %edi, %edi 1284 1285 .p2align 4 1286L(sh_9_no_prefetch_loop): 1287 movdqa 16(%eax, %edi), %xmm2 1288 sub $32, %ecx 1289 movdqa 32(%eax, %edi), %xmm3 1290 movdqa %xmm3, %xmm4 1291 palignr $9, %xmm2, %xmm3 1292 palignr $9, %xmm1, %xmm2 1293 lea 32(%edi), %edi 1294 movdqa %xmm2, -32(%edx, %edi) 1295 movdqa %xmm3, 
-16(%edx, %edi) 1296 jb L(sh_9_end_no_prefetch_loop) 1297 1298 movdqa 16(%eax, %edi), %xmm2 1299 sub $32, %ecx 1300 movdqa 32(%eax, %edi), %xmm3 1301 movdqa %xmm3, %xmm1 1302 palignr $9, %xmm2, %xmm3 1303 palignr $9, %xmm4, %xmm2 1304 lea 32(%edi), %edi 1305 movdqa %xmm2, -32(%edx, %edi) 1306 movdqa %xmm3, -16(%edx, %edi) 1307 jae L(sh_9_no_prefetch_loop) 1308 1309L(sh_9_end_no_prefetch_loop): 1310 lea 32(%ecx), %ecx 1311 add %ecx, %edi 1312 add %edi, %edx 1313 lea 9(%edi, %eax), %eax 1314 POP (%edi) 1315 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1316 1317 CFI_PUSH (%edi) 1318 1319 .p2align 4 1320L(shl_10): 1321# ifndef USE_AS_MEMMOVE 1322 movaps -10(%eax), %xmm1 1323# else 1324 movl DEST+4(%esp), %edi 1325 movaps -10(%eax), %xmm1 1326 movdqu %xmm0, (%edi) 1327# endif 1328# ifdef DATA_CACHE_SIZE_HALF 1329 cmp $DATA_CACHE_SIZE_HALF, %ecx 1330# else 1331# ifdef PIC 1332 SETUP_PIC_REG(bx) 1333 add $_GLOBAL_OFFSET_TABLE_, %ebx 1334 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1335# else 1336 cmp __x86_data_cache_size_half, %ecx 1337# endif 1338# endif 1339 jb L(sh_10_no_prefetch) 1340 1341 lea -64(%ecx), %ecx 1342 1343 .p2align 4 1344L(Shl10LoopStart): 1345 prefetcht0 0x1c0(%eax) 1346 prefetcht0 0x1c0(%edx) 1347 movaps 6(%eax), %xmm2 1348 movaps 22(%eax), %xmm3 1349 movaps 38(%eax), %xmm4 1350 movaps 54(%eax), %xmm5 1351 movaps %xmm5, %xmm7 1352 palignr $10, %xmm4, %xmm5 1353 palignr $10, %xmm3, %xmm4 1354 movaps %xmm5, 48(%edx) 1355 palignr $10, %xmm2, %xmm3 1356 lea 64(%eax), %eax 1357 palignr $10, %xmm1, %xmm2 1358 movaps %xmm4, 32(%edx) 1359 movaps %xmm3, 16(%edx) 1360 movaps %xmm7, %xmm1 1361 movaps %xmm2, (%edx) 1362 lea 64(%edx), %edx 1363 sub $64, %ecx 1364 ja L(Shl10LoopStart) 1365 1366L(Shl10LoopLeave): 1367 add $32, %ecx 1368 jle L(shl_end_0) 1369 1370 movaps 6(%eax), %xmm2 1371 movaps 22(%eax), %xmm3 1372 palignr $10, %xmm2, %xmm3 1373 palignr $10, %xmm1, %xmm2 1374 1375 movaps %xmm2, (%edx) 1376 movaps %xmm3, 16(%edx) 1377 lea 32(%edx, %ecx), %edx 1378 lea 32(%eax, %ecx), %eax 1379 POP (%edi) 1380 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1381 1382 CFI_PUSH (%edi) 1383 1384 .p2align 4 1385L(sh_10_no_prefetch): 1386 lea -32(%ecx), %ecx 1387 lea -10(%eax), %eax 1388 xor %edi, %edi 1389 1390 .p2align 4 1391L(sh_10_no_prefetch_loop): 1392 movdqa 16(%eax, %edi), %xmm2 1393 sub $32, %ecx 1394 movdqa 32(%eax, %edi), %xmm3 1395 movdqa %xmm3, %xmm4 1396 palignr $10, %xmm2, %xmm3 1397 palignr $10, %xmm1, %xmm2 1398 lea 32(%edi), %edi 1399 movdqa %xmm2, -32(%edx, %edi) 1400 movdqa %xmm3, -16(%edx, %edi) 1401 jb L(sh_10_end_no_prefetch_loop) 1402 1403 movdqa 16(%eax, %edi), %xmm2 1404 sub $32, %ecx 1405 movdqa 32(%eax, %edi), %xmm3 1406 movdqa %xmm3, %xmm1 1407 palignr $10, %xmm2, %xmm3 1408 palignr $10, %xmm4, %xmm2 1409 lea 32(%edi), %edi 1410 movdqa %xmm2, -32(%edx, %edi) 1411 movdqa %xmm3, -16(%edx, %edi) 1412 jae L(sh_10_no_prefetch_loop) 1413 1414L(sh_10_end_no_prefetch_loop): 1415 lea 32(%ecx), %ecx 1416 add %ecx, %edi 1417 add %edi, %edx 1418 lea 10(%edi, %eax), %eax 1419 POP (%edi) 1420 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1421 1422 CFI_PUSH (%edi) 1423 1424 .p2align 4 1425L(shl_11): 1426# ifndef USE_AS_MEMMOVE 1427 movaps -11(%eax), %xmm1 1428# else 1429 movl DEST+4(%esp), %edi 1430 movaps -11(%eax), %xmm1 1431 movdqu %xmm0, (%edi) 1432# endif 1433# ifdef DATA_CACHE_SIZE_HALF 1434 cmp $DATA_CACHE_SIZE_HALF, %ecx 1435# else 1436# ifdef PIC 1437 SETUP_PIC_REG(bx) 1438 add $_GLOBAL_OFFSET_TABLE_, %ebx 1439 cmp 
__x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1440# else 1441 cmp __x86_data_cache_size_half, %ecx 1442# endif 1443# endif 1444 jb L(sh_11_no_prefetch) 1445 1446 lea -64(%ecx), %ecx 1447 1448 .p2align 4 1449L(Shl11LoopStart): 1450 prefetcht0 0x1c0(%eax) 1451 prefetcht0 0x1c0(%edx) 1452 movaps 5(%eax), %xmm2 1453 movaps 21(%eax), %xmm3 1454 movaps 37(%eax), %xmm4 1455 movaps 53(%eax), %xmm5 1456 movaps %xmm5, %xmm7 1457 palignr $11, %xmm4, %xmm5 1458 palignr $11, %xmm3, %xmm4 1459 movaps %xmm5, 48(%edx) 1460 palignr $11, %xmm2, %xmm3 1461 lea 64(%eax), %eax 1462 palignr $11, %xmm1, %xmm2 1463 movaps %xmm4, 32(%edx) 1464 movaps %xmm3, 16(%edx) 1465 movaps %xmm7, %xmm1 1466 movaps %xmm2, (%edx) 1467 lea 64(%edx), %edx 1468 sub $64, %ecx 1469 ja L(Shl11LoopStart) 1470 1471L(Shl11LoopLeave): 1472 add $32, %ecx 1473 jle L(shl_end_0) 1474 1475 movaps 5(%eax), %xmm2 1476 movaps 21(%eax), %xmm3 1477 palignr $11, %xmm2, %xmm3 1478 palignr $11, %xmm1, %xmm2 1479 1480 movaps %xmm2, (%edx) 1481 movaps %xmm3, 16(%edx) 1482 lea 32(%edx, %ecx), %edx 1483 lea 32(%eax, %ecx), %eax 1484 POP (%edi) 1485 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1486 1487 CFI_PUSH (%edi) 1488 1489 .p2align 4 1490L(sh_11_no_prefetch): 1491 lea -32(%ecx), %ecx 1492 lea -11(%eax), %eax 1493 xor %edi, %edi 1494 1495 .p2align 4 1496L(sh_11_no_prefetch_loop): 1497 movdqa 16(%eax, %edi), %xmm2 1498 sub $32, %ecx 1499 movdqa 32(%eax, %edi), %xmm3 1500 movdqa %xmm3, %xmm4 1501 palignr $11, %xmm2, %xmm3 1502 palignr $11, %xmm1, %xmm2 1503 lea 32(%edi), %edi 1504 movdqa %xmm2, -32(%edx, %edi) 1505 movdqa %xmm3, -16(%edx, %edi) 1506 jb L(sh_11_end_no_prefetch_loop) 1507 1508 movdqa 16(%eax, %edi), %xmm2 1509 sub $32, %ecx 1510 movdqa 32(%eax, %edi), %xmm3 1511 movdqa %xmm3, %xmm1 1512 palignr $11, %xmm2, %xmm3 1513 palignr $11, %xmm4, %xmm2 1514 lea 32(%edi), %edi 1515 movdqa %xmm2, -32(%edx, %edi) 1516 movdqa %xmm3, -16(%edx, %edi) 1517 jae L(sh_11_no_prefetch_loop) 1518 1519L(sh_11_end_no_prefetch_loop): 1520 lea 32(%ecx), %ecx 1521 add %ecx, %edi 1522 add %edi, %edx 1523 lea 11(%edi, %eax), %eax 1524 POP (%edi) 1525 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1526 1527 CFI_PUSH (%edi) 1528 1529 .p2align 4 1530L(shl_12): 1531# ifndef USE_AS_MEMMOVE 1532 movaps -12(%eax), %xmm1 1533# else 1534 movl DEST+4(%esp), %edi 1535 movaps -12(%eax), %xmm1 1536 movdqu %xmm0, (%edi) 1537# endif 1538# ifdef DATA_CACHE_SIZE_HALF 1539 cmp $DATA_CACHE_SIZE_HALF, %ecx 1540# else 1541# ifdef PIC 1542 SETUP_PIC_REG(bx) 1543 add $_GLOBAL_OFFSET_TABLE_, %ebx 1544 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1545# else 1546 cmp __x86_data_cache_size_half, %ecx 1547# endif 1548# endif 1549 jb L(sh_12_no_prefetch) 1550 1551 lea -64(%ecx), %ecx 1552 1553 .p2align 4 1554L(Shl12LoopStart): 1555 prefetcht0 0x1c0(%eax) 1556 prefetcht0 0x1c0(%edx) 1557 movaps 4(%eax), %xmm2 1558 movaps 20(%eax), %xmm3 1559 movaps 36(%eax), %xmm4 1560 movaps 52(%eax), %xmm5 1561 movaps %xmm5, %xmm7 1562 palignr $12, %xmm4, %xmm5 1563 palignr $12, %xmm3, %xmm4 1564 movaps %xmm5, 48(%edx) 1565 palignr $12, %xmm2, %xmm3 1566 lea 64(%eax), %eax 1567 palignr $12, %xmm1, %xmm2 1568 movaps %xmm4, 32(%edx) 1569 movaps %xmm3, 16(%edx) 1570 movaps %xmm7, %xmm1 1571 movaps %xmm2, (%edx) 1572 lea 64(%edx), %edx 1573 sub $64, %ecx 1574 ja L(Shl12LoopStart) 1575 1576L(Shl12LoopLeave): 1577 add $32, %ecx 1578 jle L(shl_end_0) 1579 1580 movaps 4(%eax), %xmm2 1581 movaps 20(%eax), %xmm3 1582 palignr $12, %xmm2, %xmm3 1583 palignr $12, %xmm1, %xmm2 1584 1585 movaps %xmm2, 
(%edx) 1586 movaps %xmm3, 16(%edx) 1587 lea 32(%edx, %ecx), %edx 1588 lea 32(%eax, %ecx), %eax 1589 POP (%edi) 1590 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1591 1592 CFI_PUSH (%edi) 1593 1594 .p2align 4 1595L(sh_12_no_prefetch): 1596 lea -32(%ecx), %ecx 1597 lea -12(%eax), %eax 1598 xor %edi, %edi 1599 1600 .p2align 4 1601L(sh_12_no_prefetch_loop): 1602 movdqa 16(%eax, %edi), %xmm2 1603 sub $32, %ecx 1604 movdqa 32(%eax, %edi), %xmm3 1605 movdqa %xmm3, %xmm4 1606 palignr $12, %xmm2, %xmm3 1607 palignr $12, %xmm1, %xmm2 1608 lea 32(%edi), %edi 1609 movdqa %xmm2, -32(%edx, %edi) 1610 movdqa %xmm3, -16(%edx, %edi) 1611 jb L(sh_12_end_no_prefetch_loop) 1612 1613 movdqa 16(%eax, %edi), %xmm2 1614 sub $32, %ecx 1615 movdqa 32(%eax, %edi), %xmm3 1616 movdqa %xmm3, %xmm1 1617 palignr $12, %xmm2, %xmm3 1618 palignr $12, %xmm4, %xmm2 1619 lea 32(%edi), %edi 1620 movdqa %xmm2, -32(%edx, %edi) 1621 movdqa %xmm3, -16(%edx, %edi) 1622 jae L(sh_12_no_prefetch_loop) 1623 1624L(sh_12_end_no_prefetch_loop): 1625 lea 32(%ecx), %ecx 1626 add %ecx, %edi 1627 add %edi, %edx 1628 lea 12(%edi, %eax), %eax 1629 POP (%edi) 1630 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1631 1632 CFI_PUSH (%edi) 1633 1634 .p2align 4 1635L(shl_13): 1636# ifndef USE_AS_MEMMOVE 1637 movaps -13(%eax), %xmm1 1638# else 1639 movl DEST+4(%esp), %edi 1640 movaps -13(%eax), %xmm1 1641 movdqu %xmm0, (%edi) 1642# endif 1643# ifdef DATA_CACHE_SIZE_HALF 1644 cmp $DATA_CACHE_SIZE_HALF, %ecx 1645# else 1646# ifdef PIC 1647 SETUP_PIC_REG(bx) 1648 add $_GLOBAL_OFFSET_TABLE_, %ebx 1649 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1650# else 1651 cmp __x86_data_cache_size_half, %ecx 1652# endif 1653# endif 1654 jb L(sh_13_no_prefetch) 1655 1656 lea -64(%ecx), %ecx 1657 1658 .p2align 4 1659L(Shl13LoopStart): 1660 prefetcht0 0x1c0(%eax) 1661 prefetcht0 0x1c0(%edx) 1662 movaps 3(%eax), %xmm2 1663 movaps 19(%eax), %xmm3 1664 movaps 35(%eax), %xmm4 1665 movaps 51(%eax), %xmm5 1666 movaps %xmm5, %xmm7 1667 palignr $13, %xmm4, %xmm5 1668 palignr $13, %xmm3, %xmm4 1669 movaps %xmm5, 48(%edx) 1670 palignr $13, %xmm2, %xmm3 1671 lea 64(%eax), %eax 1672 palignr $13, %xmm1, %xmm2 1673 movaps %xmm4, 32(%edx) 1674 movaps %xmm3, 16(%edx) 1675 movaps %xmm7, %xmm1 1676 movaps %xmm2, (%edx) 1677 lea 64(%edx), %edx 1678 sub $64, %ecx 1679 ja L(Shl13LoopStart) 1680 1681L(Shl13LoopLeave): 1682 add $32, %ecx 1683 jle L(shl_end_0) 1684 1685 movaps 3(%eax), %xmm2 1686 movaps 19(%eax), %xmm3 1687 palignr $13, %xmm2, %xmm3 1688 palignr $13, %xmm1, %xmm2 1689 1690 movaps %xmm2, (%edx) 1691 movaps %xmm3, 16(%edx) 1692 lea 32(%edx, %ecx), %edx 1693 lea 32(%eax, %ecx), %eax 1694 POP (%edi) 1695 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1696 1697 CFI_PUSH (%edi) 1698 1699 .p2align 4 1700L(sh_13_no_prefetch): 1701 lea -32(%ecx), %ecx 1702 lea -13(%eax), %eax 1703 xor %edi, %edi 1704 1705 .p2align 4 1706L(sh_13_no_prefetch_loop): 1707 movdqa 16(%eax, %edi), %xmm2 1708 sub $32, %ecx 1709 movdqa 32(%eax, %edi), %xmm3 1710 movdqa %xmm3, %xmm4 1711 palignr $13, %xmm2, %xmm3 1712 palignr $13, %xmm1, %xmm2 1713 lea 32(%edi), %edi 1714 movdqa %xmm2, -32(%edx, %edi) 1715 movdqa %xmm3, -16(%edx, %edi) 1716 jb L(sh_13_end_no_prefetch_loop) 1717 1718 movdqa 16(%eax, %edi), %xmm2 1719 sub $32, %ecx 1720 movdqa 32(%eax, %edi), %xmm3 1721 movdqa %xmm3, %xmm1 1722 palignr $13, %xmm2, %xmm3 1723 palignr $13, %xmm4, %xmm2 1724 lea 32(%edi), %edi 1725 movdqa %xmm2, -32(%edx, %edi) 1726 movdqa %xmm3, -16(%edx, %edi) 1727 jae L(sh_13_no_prefetch_loop) 1728 
1729L(sh_13_end_no_prefetch_loop): 1730 lea 32(%ecx), %ecx 1731 add %ecx, %edi 1732 add %edi, %edx 1733 lea 13(%edi, %eax), %eax 1734 POP (%edi) 1735 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1736 1737 CFI_PUSH (%edi) 1738 1739 .p2align 4 1740L(shl_14): 1741# ifndef USE_AS_MEMMOVE 1742 movaps -14(%eax), %xmm1 1743# else 1744 movl DEST+4(%esp), %edi 1745 movaps -14(%eax), %xmm1 1746 movdqu %xmm0, (%edi) 1747# endif 1748# ifdef DATA_CACHE_SIZE_HALF 1749 cmp $DATA_CACHE_SIZE_HALF, %ecx 1750# else 1751# ifdef PIC 1752 SETUP_PIC_REG(bx) 1753 add $_GLOBAL_OFFSET_TABLE_, %ebx 1754 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1755# else 1756 cmp __x86_data_cache_size_half, %ecx 1757# endif 1758# endif 1759 jb L(sh_14_no_prefetch) 1760 1761 lea -64(%ecx), %ecx 1762 1763 .p2align 4 1764L(Shl14LoopStart): 1765 prefetcht0 0x1c0(%eax) 1766 prefetcht0 0x1c0(%edx) 1767 movaps 2(%eax), %xmm2 1768 movaps 18(%eax), %xmm3 1769 movaps 34(%eax), %xmm4 1770 movaps 50(%eax), %xmm5 1771 movaps %xmm5, %xmm7 1772 palignr $14, %xmm4, %xmm5 1773 palignr $14, %xmm3, %xmm4 1774 movaps %xmm5, 48(%edx) 1775 palignr $14, %xmm2, %xmm3 1776 lea 64(%eax), %eax 1777 palignr $14, %xmm1, %xmm2 1778 movaps %xmm4, 32(%edx) 1779 movaps %xmm3, 16(%edx) 1780 movaps %xmm7, %xmm1 1781 movaps %xmm2, (%edx) 1782 lea 64(%edx), %edx 1783 sub $64, %ecx 1784 ja L(Shl14LoopStart) 1785 1786L(Shl14LoopLeave): 1787 add $32, %ecx 1788 jle L(shl_end_0) 1789 1790 movaps 2(%eax), %xmm2 1791 movaps 18(%eax), %xmm3 1792 palignr $14, %xmm2, %xmm3 1793 palignr $14, %xmm1, %xmm2 1794 1795 movaps %xmm2, (%edx) 1796 movaps %xmm3, 16(%edx) 1797 lea 32(%edx, %ecx), %edx 1798 lea 32(%eax, %ecx), %eax 1799 POP (%edi) 1800 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1801 1802 CFI_PUSH (%edi) 1803 1804 .p2align 4 1805L(sh_14_no_prefetch): 1806 lea -32(%ecx), %ecx 1807 lea -14(%eax), %eax 1808 xor %edi, %edi 1809 1810 .p2align 4 1811L(sh_14_no_prefetch_loop): 1812 movdqa 16(%eax, %edi), %xmm2 1813 sub $32, %ecx 1814 movdqa 32(%eax, %edi), %xmm3 1815 movdqa %xmm3, %xmm4 1816 palignr $14, %xmm2, %xmm3 1817 palignr $14, %xmm1, %xmm2 1818 lea 32(%edi), %edi 1819 movdqa %xmm2, -32(%edx, %edi) 1820 movdqa %xmm3, -16(%edx, %edi) 1821 jb L(sh_14_end_no_prefetch_loop) 1822 1823 movdqa 16(%eax, %edi), %xmm2 1824 sub $32, %ecx 1825 movdqa 32(%eax, %edi), %xmm3 1826 movdqa %xmm3, %xmm1 1827 palignr $14, %xmm2, %xmm3 1828 palignr $14, %xmm4, %xmm2 1829 lea 32(%edi), %edi 1830 movdqa %xmm2, -32(%edx, %edi) 1831 movdqa %xmm3, -16(%edx, %edi) 1832 jae L(sh_14_no_prefetch_loop) 1833 1834L(sh_14_end_no_prefetch_loop): 1835 lea 32(%ecx), %ecx 1836 add %ecx, %edi 1837 add %edi, %edx 1838 lea 14(%edi, %eax), %eax 1839 POP (%edi) 1840 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1841 1842 CFI_PUSH (%edi) 1843 1844 .p2align 4 1845L(shl_15): 1846# ifndef USE_AS_MEMMOVE 1847 movaps -15(%eax), %xmm1 1848# else 1849 movl DEST+4(%esp), %edi 1850 movaps -15(%eax), %xmm1 1851 movdqu %xmm0, (%edi) 1852# endif 1853# ifdef DATA_CACHE_SIZE_HALF 1854 cmp $DATA_CACHE_SIZE_HALF, %ecx 1855# else 1856# ifdef PIC 1857 SETUP_PIC_REG(bx) 1858 add $_GLOBAL_OFFSET_TABLE_, %ebx 1859 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1860# else 1861 cmp __x86_data_cache_size_half, %ecx 1862# endif 1863# endif 1864 jb L(sh_15_no_prefetch) 1865 1866 lea -64(%ecx), %ecx 1867 1868 .p2align 4 1869L(Shl15LoopStart): 1870 prefetcht0 0x1c0(%eax) 1871 prefetcht0 0x1c0(%edx) 1872 movaps 1(%eax), %xmm2 1873 movaps 17(%eax), %xmm3 1874 movaps 33(%eax), %xmm4 1875 movaps 49(%eax), 
%xmm5 1876 movaps %xmm5, %xmm7 1877 palignr $15, %xmm4, %xmm5 1878 palignr $15, %xmm3, %xmm4 1879 movaps %xmm5, 48(%edx) 1880 palignr $15, %xmm2, %xmm3 1881 lea 64(%eax), %eax 1882 palignr $15, %xmm1, %xmm2 1883 movaps %xmm4, 32(%edx) 1884 movaps %xmm3, 16(%edx) 1885 movaps %xmm7, %xmm1 1886 movaps %xmm2, (%edx) 1887 lea 64(%edx), %edx 1888 sub $64, %ecx 1889 ja L(Shl15LoopStart) 1890 1891L(Shl15LoopLeave): 1892 add $32, %ecx 1893 jle L(shl_end_0) 1894 1895 movaps 1(%eax), %xmm2 1896 movaps 17(%eax), %xmm3 1897 palignr $15, %xmm2, %xmm3 1898 palignr $15, %xmm1, %xmm2 1899 1900 movaps %xmm2, (%edx) 1901 movaps %xmm3, 16(%edx) 1902 lea 32(%edx, %ecx), %edx 1903 lea 32(%eax, %ecx), %eax 1904 POP (%edi) 1905 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1906 1907 CFI_PUSH (%edi) 1908 1909 .p2align 4 1910L(sh_15_no_prefetch): 1911 lea -32(%ecx), %ecx 1912 lea -15(%eax), %eax 1913 xor %edi, %edi 1914 1915 .p2align 4 1916L(sh_15_no_prefetch_loop): 1917 movdqa 16(%eax, %edi), %xmm2 1918 sub $32, %ecx 1919 movdqa 32(%eax, %edi), %xmm3 1920 movdqa %xmm3, %xmm4 1921 palignr $15, %xmm2, %xmm3 1922 palignr $15, %xmm1, %xmm2 1923 lea 32(%edi), %edi 1924 movdqa %xmm2, -32(%edx, %edi) 1925 movdqa %xmm3, -16(%edx, %edi) 1926 jb L(sh_15_end_no_prefetch_loop) 1927 1928 movdqa 16(%eax, %edi), %xmm2 1929 sub $32, %ecx 1930 movdqa 32(%eax, %edi), %xmm3 1931 movdqa %xmm3, %xmm1 1932 palignr $15, %xmm2, %xmm3 1933 palignr $15, %xmm4, %xmm2 1934 lea 32(%edi), %edi 1935 movdqa %xmm2, -32(%edx, %edi) 1936 movdqa %xmm3, -16(%edx, %edi) 1937 jae L(sh_15_no_prefetch_loop) 1938 1939L(sh_15_end_no_prefetch_loop): 1940 lea 32(%ecx), %ecx 1941 add %ecx, %edi 1942 add %edi, %edx 1943 lea 15(%edi, %eax), %eax 1944 POP (%edi) 1945 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1946 1947 CFI_PUSH (%edi) 1948 1949 .p2align 4 1950L(shl_end_0): 1951 lea 32(%ecx), %ecx 1952 lea (%edx, %ecx), %edx 1953 lea (%eax, %ecx), %eax 1954 POP (%edi) 1955 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1956 1957 .p2align 4 1958L(fwd_write_44bytes): 1959 movq -44(%eax), %xmm0 1960 movq %xmm0, -44(%edx) 1961L(fwd_write_36bytes): 1962 movq -36(%eax), %xmm0 1963 movq %xmm0, -36(%edx) 1964L(fwd_write_28bytes): 1965 movq -28(%eax), %xmm0 1966 movq %xmm0, -28(%edx) 1967L(fwd_write_20bytes): 1968 movq -20(%eax), %xmm0 1969 movq %xmm0, -20(%edx) 1970L(fwd_write_12bytes): 1971 movq -12(%eax), %xmm0 1972 movq %xmm0, -12(%edx) 1973L(fwd_write_4bytes): 1974 movl -4(%eax), %ecx 1975 movl %ecx, -4(%edx) 1976# ifdef USE_AS_MEMPCPY 1977 movl %edx, %eax 1978# else 1979 movl DEST(%esp), %eax 1980# endif 1981 RETURN 1982 1983 .p2align 4 1984L(fwd_write_40bytes): 1985 movq -40(%eax), %xmm0 1986 movq %xmm0, -40(%edx) 1987L(fwd_write_32bytes): 1988 movq -32(%eax), %xmm0 1989 movq %xmm0, -32(%edx) 1990L(fwd_write_24bytes): 1991 movq -24(%eax), %xmm0 1992 movq %xmm0, -24(%edx) 1993L(fwd_write_16bytes): 1994 movq -16(%eax), %xmm0 1995 movq %xmm0, -16(%edx) 1996L(fwd_write_8bytes): 1997 movq -8(%eax), %xmm0 1998 movq %xmm0, -8(%edx) 1999L(fwd_write_0bytes): 2000# ifdef USE_AS_MEMPCPY 2001 movl %edx, %eax 2002# else 2003 movl DEST(%esp), %eax 2004# endif 2005 RETURN 2006 2007 .p2align 4 2008L(fwd_write_5bytes): 2009 movl -5(%eax), %ecx 2010 movl -4(%eax), %eax 2011 movl %ecx, -5(%edx) 2012 movl %eax, -4(%edx) 2013# ifdef USE_AS_MEMPCPY 2014 movl %edx, %eax 2015# else 2016 movl DEST(%esp), %eax 2017# endif 2018 RETURN 2019 2020 .p2align 4 2021L(fwd_write_45bytes): 2022 movq -45(%eax), %xmm0 2023 movq %xmm0, -45(%edx) 2024L(fwd_write_37bytes): 2025 
movq -37(%eax), %xmm0 2026 movq %xmm0, -37(%edx) 2027L(fwd_write_29bytes): 2028 movq -29(%eax), %xmm0 2029 movq %xmm0, -29(%edx) 2030L(fwd_write_21bytes): 2031 movq -21(%eax), %xmm0 2032 movq %xmm0, -21(%edx) 2033L(fwd_write_13bytes): 2034 movq -13(%eax), %xmm0 2035 movq %xmm0, -13(%edx) 2036 movl -5(%eax), %ecx 2037 movl %ecx, -5(%edx) 2038 movzbl -1(%eax), %ecx 2039 movb %cl, -1(%edx) 2040# ifdef USE_AS_MEMPCPY 2041 movl %edx, %eax 2042# else 2043 movl DEST(%esp), %eax 2044# endif 2045 RETURN 2046 2047 .p2align 4 2048L(fwd_write_41bytes): 2049 movq -41(%eax), %xmm0 2050 movq %xmm0, -41(%edx) 2051L(fwd_write_33bytes): 2052 movq -33(%eax), %xmm0 2053 movq %xmm0, -33(%edx) 2054L(fwd_write_25bytes): 2055 movq -25(%eax), %xmm0 2056 movq %xmm0, -25(%edx) 2057L(fwd_write_17bytes): 2058 movq -17(%eax), %xmm0 2059 movq %xmm0, -17(%edx) 2060L(fwd_write_9bytes): 2061 movq -9(%eax), %xmm0 2062 movq %xmm0, -9(%edx) 2063L(fwd_write_1bytes): 2064 movzbl -1(%eax), %ecx 2065 movb %cl, -1(%edx) 2066# ifdef USE_AS_MEMPCPY 2067 movl %edx, %eax 2068# else 2069 movl DEST(%esp), %eax 2070# endif 2071 RETURN 2072 2073 .p2align 4 2074L(fwd_write_46bytes): 2075 movq -46(%eax), %xmm0 2076 movq %xmm0, -46(%edx) 2077L(fwd_write_38bytes): 2078 movq -38(%eax), %xmm0 2079 movq %xmm0, -38(%edx) 2080L(fwd_write_30bytes): 2081 movq -30(%eax), %xmm0 2082 movq %xmm0, -30(%edx) 2083L(fwd_write_22bytes): 2084 movq -22(%eax), %xmm0 2085 movq %xmm0, -22(%edx) 2086L(fwd_write_14bytes): 2087 movq -14(%eax), %xmm0 2088 movq %xmm0, -14(%edx) 2089L(fwd_write_6bytes): 2090 movl -6(%eax), %ecx 2091 movl %ecx, -6(%edx) 2092 movzwl -2(%eax), %ecx 2093 movw %cx, -2(%edx) 2094# ifdef USE_AS_MEMPCPY 2095 movl %edx, %eax 2096# else 2097 movl DEST(%esp), %eax 2098# endif 2099 RETURN 2100 2101 .p2align 4 2102L(fwd_write_42bytes): 2103 movq -42(%eax), %xmm0 2104 movq %xmm0, -42(%edx) 2105L(fwd_write_34bytes): 2106 movq -34(%eax), %xmm0 2107 movq %xmm0, -34(%edx) 2108L(fwd_write_26bytes): 2109 movq -26(%eax), %xmm0 2110 movq %xmm0, -26(%edx) 2111L(fwd_write_18bytes): 2112 movq -18(%eax), %xmm0 2113 movq %xmm0, -18(%edx) 2114L(fwd_write_10bytes): 2115 movq -10(%eax), %xmm0 2116 movq %xmm0, -10(%edx) 2117L(fwd_write_2bytes): 2118 movzwl -2(%eax), %ecx 2119 movw %cx, -2(%edx) 2120# ifdef USE_AS_MEMPCPY 2121 movl %edx, %eax 2122# else 2123 movl DEST(%esp), %eax 2124# endif 2125 RETURN 2126 2127 .p2align 4 2128L(fwd_write_47bytes): 2129 movq -47(%eax), %xmm0 2130 movq %xmm0, -47(%edx) 2131L(fwd_write_39bytes): 2132 movq -39(%eax), %xmm0 2133 movq %xmm0, -39(%edx) 2134L(fwd_write_31bytes): 2135 movq -31(%eax), %xmm0 2136 movq %xmm0, -31(%edx) 2137L(fwd_write_23bytes): 2138 movq -23(%eax), %xmm0 2139 movq %xmm0, -23(%edx) 2140L(fwd_write_15bytes): 2141 movq -15(%eax), %xmm0 2142 movq %xmm0, -15(%edx) 2143L(fwd_write_7bytes): 2144 movl -7(%eax), %ecx 2145 movl %ecx, -7(%edx) 2146 movzwl -3(%eax), %ecx 2147 movzbl -1(%eax), %eax 2148 movw %cx, -3(%edx) 2149 movb %al, -1(%edx) 2150# ifdef USE_AS_MEMPCPY 2151 movl %edx, %eax 2152# else 2153 movl DEST(%esp), %eax 2154# endif 2155 RETURN 2156 2157 .p2align 4 2158L(fwd_write_43bytes): 2159 movq -43(%eax), %xmm0 2160 movq %xmm0, -43(%edx) 2161L(fwd_write_35bytes): 2162 movq -35(%eax), %xmm0 2163 movq %xmm0, -35(%edx) 2164L(fwd_write_27bytes): 2165 movq -27(%eax), %xmm0 2166 movq %xmm0, -27(%edx) 2167L(fwd_write_19bytes): 2168 movq -19(%eax), %xmm0 2169 movq %xmm0, -19(%edx) 2170L(fwd_write_11bytes): 2171 movq -11(%eax), %xmm0 2172 movq %xmm0, -11(%edx) 2173L(fwd_write_3bytes): 2174 movzwl -3(%eax), %ecx 2175 
movzbl -1(%eax), %eax 2176 movw %cx, -3(%edx) 2177 movb %al, -1(%edx) 2178# ifdef USE_AS_MEMPCPY 2179 movl %edx, %eax 2180# else 2181 movl DEST(%esp), %eax 2182# endif 2183 RETURN 2184 2185 .p2align 4 2186L(fwd_write_40bytes_align): 2187 movdqa -40(%eax), %xmm0 2188 movdqa %xmm0, -40(%edx) 2189L(fwd_write_24bytes_align): 2190 movdqa -24(%eax), %xmm0 2191 movdqa %xmm0, -24(%edx) 2192L(fwd_write_8bytes_align): 2193 movq -8(%eax), %xmm0 2194 movq %xmm0, -8(%edx) 2195L(fwd_write_0bytes_align): 2196# ifdef USE_AS_MEMPCPY 2197 movl %edx, %eax 2198# else 2199 movl DEST(%esp), %eax 2200# endif 2201 RETURN 2202 2203 .p2align 4 2204L(fwd_write_32bytes_align): 2205 movdqa -32(%eax), %xmm0 2206 movdqa %xmm0, -32(%edx) 2207L(fwd_write_16bytes_align): 2208 movdqa -16(%eax), %xmm0 2209 movdqa %xmm0, -16(%edx) 2210# ifdef USE_AS_MEMPCPY 2211 movl %edx, %eax 2212# else 2213 movl DEST(%esp), %eax 2214# endif 2215 RETURN 2216 2217 .p2align 4 2218L(fwd_write_5bytes_align): 2219 movl -5(%eax), %ecx 2220 movl -4(%eax), %eax 2221 movl %ecx, -5(%edx) 2222 movl %eax, -4(%edx) 2223# ifdef USE_AS_MEMPCPY 2224 movl %edx, %eax 2225# else 2226 movl DEST(%esp), %eax 2227# endif 2228 RETURN 2229 2230 .p2align 4 2231L(fwd_write_45bytes_align): 2232 movdqa -45(%eax), %xmm0 2233 movdqa %xmm0, -45(%edx) 2234L(fwd_write_29bytes_align): 2235 movdqa -29(%eax), %xmm0 2236 movdqa %xmm0, -29(%edx) 2237L(fwd_write_13bytes_align): 2238 movq -13(%eax), %xmm0 2239 movq %xmm0, -13(%edx) 2240 movl -5(%eax), %ecx 2241 movl %ecx, -5(%edx) 2242 movzbl -1(%eax), %ecx 2243 movb %cl, -1(%edx) 2244# ifdef USE_AS_MEMPCPY 2245 movl %edx, %eax 2246# else 2247 movl DEST(%esp), %eax 2248# endif 2249 RETURN 2250 2251 .p2align 4 2252L(fwd_write_37bytes_align): 2253 movdqa -37(%eax), %xmm0 2254 movdqa %xmm0, -37(%edx) 2255L(fwd_write_21bytes_align): 2256 movdqa -21(%eax), %xmm0 2257 movdqa %xmm0, -21(%edx) 2258 movl -5(%eax), %ecx 2259 movl %ecx, -5(%edx) 2260 movzbl -1(%eax), %ecx 2261 movb %cl, -1(%edx) 2262# ifdef USE_AS_MEMPCPY 2263 movl %edx, %eax 2264# else 2265 movl DEST(%esp), %eax 2266# endif 2267 RETURN 2268 2269 .p2align 4 2270L(fwd_write_41bytes_align): 2271 movdqa -41(%eax), %xmm0 2272 movdqa %xmm0, -41(%edx) 2273L(fwd_write_25bytes_align): 2274 movdqa -25(%eax), %xmm0 2275 movdqa %xmm0, -25(%edx) 2276L(fwd_write_9bytes_align): 2277 movq -9(%eax), %xmm0 2278 movq %xmm0, -9(%edx) 2279L(fwd_write_1bytes_align): 2280 movzbl -1(%eax), %ecx 2281 movb %cl, -1(%edx) 2282# ifdef USE_AS_MEMPCPY 2283 movl %edx, %eax 2284# else 2285 movl DEST(%esp), %eax 2286# endif 2287 RETURN 2288 2289 .p2align 4 2290L(fwd_write_33bytes_align): 2291 movdqa -33(%eax), %xmm0 2292 movdqa %xmm0, -33(%edx) 2293L(fwd_write_17bytes_align): 2294 movdqa -17(%eax), %xmm0 2295 movdqa %xmm0, -17(%edx) 2296 movzbl -1(%eax), %ecx 2297 movb %cl, -1(%edx) 2298# ifdef USE_AS_MEMPCPY 2299 movl %edx, %eax 2300# else 2301 movl DEST(%esp), %eax 2302# endif 2303 RETURN 2304 2305 .p2align 4 2306L(fwd_write_46bytes_align): 2307 movdqa -46(%eax), %xmm0 2308 movdqa %xmm0, -46(%edx) 2309L(fwd_write_30bytes_align): 2310 movdqa -30(%eax), %xmm0 2311 movdqa %xmm0, -30(%edx) 2312L(fwd_write_14bytes_align): 2313 movq -14(%eax), %xmm0 2314 movq %xmm0, -14(%edx) 2315L(fwd_write_6bytes_align): 2316 movl -6(%eax), %ecx 2317 movl %ecx, -6(%edx) 2318 movzwl -2(%eax), %ecx 2319 movw %cx, -2(%edx) 2320# ifdef USE_AS_MEMPCPY 2321 movl %edx, %eax 2322# else 2323 movl DEST(%esp), %eax 2324# endif 2325 RETURN 2326 2327 .p2align 4 2328L(fwd_write_38bytes_align): 2329 movdqa -38(%eax), %xmm0 2330 movdqa 
%xmm0, -38(%edx) 2331L(fwd_write_22bytes_align): 2332 movdqa -22(%eax), %xmm0 2333 movdqa %xmm0, -22(%edx) 2334 movl -6(%eax), %ecx 2335 movl %ecx, -6(%edx) 2336 movzwl -2(%eax), %ecx 2337 movw %cx, -2(%edx) 2338# ifdef USE_AS_MEMPCPY 2339 movl %edx, %eax 2340# else 2341 movl DEST(%esp), %eax 2342# endif 2343 RETURN 2344 2345 .p2align 4 2346L(fwd_write_42bytes_align): 2347 movdqa -42(%eax), %xmm0 2348 movdqa %xmm0, -42(%edx) 2349L(fwd_write_26bytes_align): 2350 movdqa -26(%eax), %xmm0 2351 movdqa %xmm0, -26(%edx) 2352L(fwd_write_10bytes_align): 2353 movq -10(%eax), %xmm0 2354 movq %xmm0, -10(%edx) 2355L(fwd_write_2bytes_align): 2356 movzwl -2(%eax), %ecx 2357 movw %cx, -2(%edx) 2358# ifdef USE_AS_MEMPCPY 2359 movl %edx, %eax 2360# else 2361 movl DEST(%esp), %eax 2362# endif 2363 RETURN 2364 2365 .p2align 4 2366L(fwd_write_34bytes_align): 2367 movdqa -34(%eax), %xmm0 2368 movdqa %xmm0, -34(%edx) 2369L(fwd_write_18bytes_align): 2370 movdqa -18(%eax), %xmm0 2371 movdqa %xmm0, -18(%edx) 2372 movzwl -2(%eax), %ecx 2373 movw %cx, -2(%edx) 2374# ifdef USE_AS_MEMPCPY 2375 movl %edx, %eax 2376# else 2377 movl DEST(%esp), %eax 2378# endif 2379 RETURN 2380 2381 .p2align 4 2382L(fwd_write_47bytes_align): 2383 movdqa -47(%eax), %xmm0 2384 movdqa %xmm0, -47(%edx) 2385L(fwd_write_31bytes_align): 2386 movdqa -31(%eax), %xmm0 2387 movdqa %xmm0, -31(%edx) 2388L(fwd_write_15bytes_align): 2389 movq -15(%eax), %xmm0 2390 movq %xmm0, -15(%edx) 2391L(fwd_write_7bytes_align): 2392 movl -7(%eax), %ecx 2393 movl %ecx, -7(%edx) 2394 movzwl -3(%eax), %ecx 2395 movzbl -1(%eax), %eax 2396 movw %cx, -3(%edx) 2397 movb %al, -1(%edx) 2398# ifdef USE_AS_MEMPCPY 2399 movl %edx, %eax 2400# else 2401 movl DEST(%esp), %eax 2402# endif 2403 RETURN 2404 2405 .p2align 4 2406L(fwd_write_39bytes_align): 2407 movdqa -39(%eax), %xmm0 2408 movdqa %xmm0, -39(%edx) 2409L(fwd_write_23bytes_align): 2410 movdqa -23(%eax), %xmm0 2411 movdqa %xmm0, -23(%edx) 2412 movl -7(%eax), %ecx 2413 movl %ecx, -7(%edx) 2414 movzwl -3(%eax), %ecx 2415 movzbl -1(%eax), %eax 2416 movw %cx, -3(%edx) 2417 movb %al, -1(%edx) 2418# ifdef USE_AS_MEMPCPY 2419 movl %edx, %eax 2420# else 2421 movl DEST(%esp), %eax 2422# endif 2423 RETURN 2424 2425 .p2align 4 2426L(fwd_write_43bytes_align): 2427 movdqa -43(%eax), %xmm0 2428 movdqa %xmm0, -43(%edx) 2429L(fwd_write_27bytes_align): 2430 movdqa -27(%eax), %xmm0 2431 movdqa %xmm0, -27(%edx) 2432L(fwd_write_11bytes_align): 2433 movq -11(%eax), %xmm0 2434 movq %xmm0, -11(%edx) 2435L(fwd_write_3bytes_align): 2436 movzwl -3(%eax), %ecx 2437 movzbl -1(%eax), %eax 2438 movw %cx, -3(%edx) 2439 movb %al, -1(%edx) 2440# ifdef USE_AS_MEMPCPY 2441 movl %edx, %eax 2442# else 2443 movl DEST(%esp), %eax 2444# endif 2445 RETURN 2446 2447 .p2align 4 2448L(fwd_write_35bytes_align): 2449 movdqa -35(%eax), %xmm0 2450 movdqa %xmm0, -35(%edx) 2451L(fwd_write_19bytes_align): 2452 movdqa -19(%eax), %xmm0 2453 movdqa %xmm0, -19(%edx) 2454 movzwl -3(%eax), %ecx 2455 movzbl -1(%eax), %eax 2456 movw %cx, -3(%edx) 2457 movb %al, -1(%edx) 2458# ifdef USE_AS_MEMPCPY 2459 movl %edx, %eax 2460# else 2461 movl DEST(%esp), %eax 2462# endif 2463 RETURN 2464 2465 .p2align 4 2466L(fwd_write_44bytes_align): 2467 movdqa -44(%eax), %xmm0 2468 movdqa %xmm0, -44(%edx) 2469L(fwd_write_28bytes_align): 2470 movdqa -28(%eax), %xmm0 2471 movdqa %xmm0, -28(%edx) 2472L(fwd_write_12bytes_align): 2473 movq -12(%eax), %xmm0 2474 movq %xmm0, -12(%edx) 2475L(fwd_write_4bytes_align): 2476 movl -4(%eax), %ecx 2477 movl %ecx, -4(%edx) 2478# ifdef USE_AS_MEMPCPY 2479 movl 
	.p2align 4
L(large_page):
	movdqu (%eax), %xmm1
# ifdef USE_AS_MEMMOVE
	movl DEST+4(%esp), %edi
	movdqu %xmm0, (%edi)
# endif
	lea 16(%eax), %eax
	movntdq %xmm1, (%edx)
	lea 16(%edx), %edx
	lea -0x90(%ecx), %ecx
	POP (%edi)

	.p2align 4
L(large_page_loop):
	movdqu (%eax), %xmm0
	movdqu 0x10(%eax), %xmm1
	movdqu 0x20(%eax), %xmm2
	movdqu 0x30(%eax), %xmm3
	movdqu 0x40(%eax), %xmm4
	movdqu 0x50(%eax), %xmm5
	movdqu 0x60(%eax), %xmm6
	movdqu 0x70(%eax), %xmm7
	lea 0x80(%eax), %eax

	sub $0x80, %ecx
	movntdq %xmm0, (%edx)
	movntdq %xmm1, 0x10(%edx)
	movntdq %xmm2, 0x20(%edx)
	movntdq %xmm3, 0x30(%edx)
	movntdq %xmm4, 0x40(%edx)
	movntdq %xmm5, 0x50(%edx)
	movntdq %xmm6, 0x60(%edx)
	movntdq %xmm7, 0x70(%edx)
	lea 0x80(%edx), %edx
	jae L(large_page_loop)
	cmp $-0x40, %ecx
	lea 0x80(%ecx), %ecx
	jl L(large_page_less_64bytes)

	movdqu (%eax), %xmm0
	movdqu 0x10(%eax), %xmm1
	movdqu 0x20(%eax), %xmm2
	movdqu 0x30(%eax), %xmm3
	lea 0x40(%eax), %eax

	movntdq %xmm0, (%edx)
	movntdq %xmm1, 0x10(%edx)
	movntdq %xmm2, 0x20(%edx)
	movntdq %xmm3, 0x30(%edx)
	lea 0x40(%edx), %edx
	sub $0x40, %ecx
L(large_page_less_64bytes):
	cmp $32, %ecx
	jb L(large_page_less_32bytes)
	movdqu (%eax), %xmm0
	movdqu 0x10(%eax), %xmm1
	lea 0x20(%eax), %eax
	movntdq %xmm0, (%edx)
	movntdq %xmm1, 0x10(%edx)
	lea 0x20(%edx), %edx
	sub $0x20, %ecx
L(large_page_less_32bytes):
	add %ecx, %edx
	add %ecx, %eax
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

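/* Note: the L(bk_write_*bytes) blocks below store the final 0..47 bytes
   for the paths dispatched through L(table_48_bytes_bwd).  Here EAX and
   EDX point at the start of the remaining region, so the offsets are
   positive and count up from 0; sizes 8 bytes apart fall through to one
   another (44 -> 36 -> ... -> 4 -> 0), each label adding one 8-byte movq
   pair.  */
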
	.p2align 4
L(bk_write_44bytes):
	movq 36(%eax), %xmm0
	movq %xmm0, 36(%edx)
L(bk_write_36bytes):
	movq 28(%eax), %xmm0
	movq %xmm0, 28(%edx)
L(bk_write_28bytes):
	movq 20(%eax), %xmm0
	movq %xmm0, 20(%edx)
L(bk_write_20bytes):
	movq 12(%eax), %xmm0
	movq %xmm0, 12(%edx)
L(bk_write_12bytes):
	movq 4(%eax), %xmm0
	movq %xmm0, 4(%edx)
L(bk_write_4bytes):
	movl (%eax), %ecx
	movl %ecx, (%edx)
L(bk_write_0bytes):
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
	RETURN

	.p2align 4
L(bk_write_40bytes):
	movq 32(%eax), %xmm0
	movq %xmm0, 32(%edx)
L(bk_write_32bytes):
	movq 24(%eax), %xmm0
	movq %xmm0, 24(%edx)
L(bk_write_24bytes):
	movq 16(%eax), %xmm0
	movq %xmm0, 16(%edx)
L(bk_write_16bytes):
	movq 8(%eax), %xmm0
	movq %xmm0, 8(%edx)
L(bk_write_8bytes):
	movq (%eax), %xmm0
	movq %xmm0, (%edx)
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
	RETURN

	.p2align 4
L(bk_write_45bytes):
	movq 37(%eax), %xmm0
	movq %xmm0, 37(%edx)
L(bk_write_37bytes):
	movq 29(%eax), %xmm0
	movq %xmm0, 29(%edx)
L(bk_write_29bytes):
	movq 21(%eax), %xmm0
	movq %xmm0, 21(%edx)
L(bk_write_21bytes):
	movq 13(%eax), %xmm0
	movq %xmm0, 13(%edx)
L(bk_write_13bytes):
	movq 5(%eax), %xmm0
	movq %xmm0, 5(%edx)
L(bk_write_5bytes):
	movl 1(%eax), %ecx
	movl %ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl (%eax), %ecx
	movb %cl, (%edx)
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
	RETURN

	.p2align 4
L(bk_write_41bytes):
	movq 33(%eax), %xmm0
	movq %xmm0, 33(%edx)
L(bk_write_33bytes):
	movq 25(%eax), %xmm0
	movq %xmm0, 25(%edx)
L(bk_write_25bytes):
	movq 17(%eax), %xmm0
	movq %xmm0, 17(%edx)
L(bk_write_17bytes):
	movq 9(%eax), %xmm0
	movq %xmm0, 9(%edx)
L(bk_write_9bytes):
	movq 1(%eax), %xmm0
	movq %xmm0, 1(%edx)
	movzbl (%eax), %ecx
	movb %cl, (%edx)
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
	RETURN

	.p2align 4
L(bk_write_46bytes):
	movq 38(%eax), %xmm0
	movq %xmm0, 38(%edx)
L(bk_write_38bytes):
	movq 30(%eax), %xmm0
	movq %xmm0, 30(%edx)
L(bk_write_30bytes):
	movq 22(%eax), %xmm0
	movq %xmm0, 22(%edx)
L(bk_write_22bytes):
	movq 14(%eax), %xmm0
	movq %xmm0, 14(%edx)
L(bk_write_14bytes):
	movq 6(%eax), %xmm0
	movq %xmm0, 6(%edx)
L(bk_write_6bytes):
	movl 2(%eax), %ecx
	movl %ecx, 2(%edx)
	movzwl (%eax), %ecx
	movw %cx, (%edx)
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
	RETURN

	.p2align 4
L(bk_write_42bytes):
	movq 34(%eax), %xmm0
	movq %xmm0, 34(%edx)
L(bk_write_34bytes):
	movq 26(%eax), %xmm0
	movq %xmm0, 26(%edx)
L(bk_write_26bytes):
	movq 18(%eax), %xmm0
	movq %xmm0, 18(%edx)
L(bk_write_18bytes):
	movq 10(%eax), %xmm0
	movq %xmm0, 10(%edx)
L(bk_write_10bytes):
	movq 2(%eax), %xmm0
	movq %xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl (%eax), %ecx
	movw %cx, (%edx)
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
	RETURN

	.p2align 4
L(bk_write_47bytes):
	movq 39(%eax), %xmm0
	movq %xmm0, 39(%edx)
L(bk_write_39bytes):
	movq 31(%eax), %xmm0
	movq %xmm0, 31(%edx)
L(bk_write_31bytes):
	movq 23(%eax), %xmm0
	movq %xmm0, 23(%edx)
L(bk_write_23bytes):
	movq 15(%eax), %xmm0
	movq %xmm0, 15(%edx)
L(bk_write_15bytes):
	movq 7(%eax), %xmm0
	movq %xmm0, 7(%edx)
L(bk_write_7bytes):
	movl 3(%eax), %ecx
	movl %ecx, 3(%edx)
	movzwl 1(%eax), %ecx
	movw %cx, 1(%edx)
	movzbl (%eax), %eax
	movb %al, (%edx)
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
	RETURN

	.p2align 4
L(bk_write_43bytes):
	movq 35(%eax), %xmm0
	movq %xmm0, 35(%edx)
L(bk_write_35bytes):
	movq 27(%eax), %xmm0
	movq %xmm0, 27(%edx)
L(bk_write_27bytes):
	movq 19(%eax), %xmm0
	movq %xmm0, 19(%edx)
L(bk_write_19bytes):
	movq 11(%eax), %xmm0
	movq %xmm0, 11(%edx)
L(bk_write_11bytes):
	movq 3(%eax), %xmm0
	movq %xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl 1(%eax), %ecx
	movw %cx, 1(%edx)
	movzbl (%eax), %eax
	movb %al, (%edx)
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
	RETURN_END

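/* The read-only dispatch tables follow.  Each 48-entry table has one
   entry per residual byte count 0..47 and is indexed by that count scaled
   by 4, as in the BRANCH_TO_JMPTBL_ENTRY uses above; the entries are
   emitted through JMPTBL so the same tables work both with absolute
   addresses and, in PIC builds, with table-relative offsets.  */
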
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
L(table_48bytes_fwd):
	.int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
L(table_48bytes_fwd_align):
	.int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

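/* L(shl_table) below has 16 entries, one per possible source
   misalignment within a 16-byte block; each L(shl_N) target (defined
   earlier in this file) copies with the source effectively shifted by N
   bytes so that the destination stores can stay 16-byte aligned.  */
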
	.p2align 2
L(shl_table):
	.int JMPTBL (L(shl_0), L(shl_table))
	.int JMPTBL (L(shl_1), L(shl_table))
	.int JMPTBL (L(shl_2), L(shl_table))
	.int JMPTBL (L(shl_3), L(shl_table))
	.int JMPTBL (L(shl_4), L(shl_table))
	.int JMPTBL (L(shl_5), L(shl_table))
	.int JMPTBL (L(shl_6), L(shl_table))
	.int JMPTBL (L(shl_7), L(shl_table))
	.int JMPTBL (L(shl_8), L(shl_table))
	.int JMPTBL (L(shl_9), L(shl_table))
	.int JMPTBL (L(shl_10), L(shl_table))
	.int JMPTBL (L(shl_11), L(shl_table))
	.int JMPTBL (L(shl_12), L(shl_table))
	.int JMPTBL (L(shl_13), L(shl_table))
	.int JMPTBL (L(shl_14), L(shl_table))
	.int JMPTBL (L(shl_15), L(shl_table))

	.p2align 2
L(table_48_bytes_bwd):
	.int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

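/* What follows is the memmove-only backward path.  L(copy_backward)
   points EDI and EDX one past the end of the source and destination and
   copies downwards, which is what makes an overlapping destination above
   the source safe: it aligns the destination end first to 4 and then to
   16 bytes, moves 64 bytes per iteration in L(bk_ssse3_cpy) (unaligned
   loads, aligned stores), copies one more 32-byte chunk if at least 32
   bytes remain, and finishes the last bytes via L(table_48_bytes_bwd).  */
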
# ifdef USE_AS_MEMMOVE
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl %eax, %edi
	lea (%ecx,%edx,1),%edx
	lea (%ecx,%edi,1),%edi
	testl $0x3, %edx
	jnz L(bk_align)

L(bk_aligned_4):
	cmp $64, %ecx
	jae L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp $32, %ecx
	jb L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.  */
	sub $32, %ecx
	movq -8(%edi), %xmm0
	movq %xmm0, -8(%edx)
	movq -16(%edi), %xmm0
	movq %xmm0, -16(%edx)
	movq -24(%edi), %xmm0
	movq %xmm0, -24(%edx)
	movq -32(%edi), %xmm0
	movq %xmm0, -32(%edx)
	sub $32, %edx
	sub $32, %edi

L(bk_write_less32bytes):
	movl %edi, %eax
	sub %ecx, %edx
	sub %ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(bk_align):
	cmp $8, %ecx
	jbe L(bk_write_less32bytes)
	testl $1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
	   then (EDX & 2) must be != 0.  */
	jz L(bk_got2)
	sub $1, %edi
	sub $1, %ecx
	sub $1, %edx
	movzbl (%edi), %eax
	movb %al, (%edx)

	testl $2, %edx
	jz L(bk_aligned_4)

L(bk_got2):
	sub $2, %edi
	sub $2, %ecx
	sub $2, %edx
	movzwl (%edi), %eax
	movw %ax, (%edx)
	jmp L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl $15, %edx
	jz L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  */
L(bk_ssse3_align):
	sub $4, %edi
	sub $4, %ecx
	sub $4, %edx
	movl (%edi), %eax
	movl %eax, (%edx)

	testl $15, %edx
	jz L(bk_ssse3_cpy_pre)

	sub $4, %edi
	sub $4, %ecx
	sub $4, %edx
	movl (%edi), %eax
	movl %eax, (%edx)

	testl $15, %edx
	jz L(bk_ssse3_cpy_pre)

	sub $4, %edi
	sub $4, %ecx
	sub $4, %edx
	movl (%edi), %eax
	movl %eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp $64, %ecx
	jb L(bk_write_more32bytes)

	.p2align 4
L(bk_ssse3_cpy):
	sub $64, %edi
	sub $64, %ecx
	sub $64, %edx
	movdqu 0x30(%edi), %xmm3
	movdqa %xmm3, 0x30(%edx)
	movdqu 0x20(%edi), %xmm2
	movdqa %xmm2, 0x20(%edx)
	movdqu 0x10(%edi), %xmm1
	movdqa %xmm1, 0x10(%edx)
	movdqu (%edi), %xmm0
	movdqa %xmm0, (%edx)
	cmp $64, %ecx
	jae L(bk_ssse3_cpy)
	jmp L(bk_write_64bytesless)

# endif

END (MEMCPY)

#endif