1/* memcpy with SSSE3 and REP string. 2 Copyright (C) 2010-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <sysdep.h> 20 21#if IS_IN (libc) \ 22 && (defined SHARED \ 23 || defined USE_AS_MEMMOVE \ 24 || !defined USE_MULTIARCH) 25 26#include "asm-syntax.h" 27 28#ifndef MEMCPY 29# define MEMCPY __memcpy_ssse3_rep 30# define MEMCPY_CHK __memcpy_chk_ssse3_rep 31#endif 32 33#define DEST PARMS 34#define SRC DEST+4 35#define LEN SRC+4 36 37#define CFI_PUSH(REG) \ 38 cfi_adjust_cfa_offset (4); \ 39 cfi_rel_offset (REG, 0) 40 41#define CFI_POP(REG) \ 42 cfi_adjust_cfa_offset (-4); \ 43 cfi_restore (REG) 44 45#define PUSH(REG) pushl REG; CFI_PUSH (REG) 46#define POP(REG) popl REG; CFI_POP (REG) 47 48#ifdef PIC 49# define PARMS 8 /* Preserve EBX. */ 50# define ENTRANCE PUSH (%ebx); 51# define RETURN_END POP (%ebx); ret 52# define RETURN RETURN_END; CFI_PUSH (%ebx) 53# define JMPTBL(I, B) I - B 54 55/* Load an entry in a jump table into EBX and branch to it. TABLE is a 56 jump table with relative offsets. INDEX is a register contains the 57 index into the jump table. SCALE is the scale of INDEX. */ 58# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ 59 /* We first load PC into EBX. */ \ 60 SETUP_PIC_REG(bx); \ 61 /* Get the address of the jump table. 
*/ \ 62 addl $(TABLE - .), %ebx; \ 63 /* Get the entry and convert the relative offset to the \ 64 absolute address. */ \ 65 addl (%ebx,INDEX,SCALE), %ebx; \ 66 /* We loaded the jump table. Go. */ \ 67 _CET_NOTRACK jmp *%ebx 68 69# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ 70 addl $(TABLE - .), %ebx 71 72# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ 73 addl (%ebx,INDEX,SCALE), %ebx; \ 74 /* We loaded the jump table. Go. */ \ 75 _CET_NOTRACK jmp *%ebx 76#else 77# define PARMS 4 78# define ENTRANCE 79# define RETURN_END ret 80# define RETURN RETURN_END 81# define JMPTBL(I, B) I 82 83/* Branch to an entry in a jump table. TABLE is a jump table with 84 absolute offsets. INDEX is a register contains the index into the 85 jump table. SCALE is the scale of INDEX. */ 86# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ 87 _CET_NOTRACK jmp *TABLE(,INDEX,SCALE) 88 89# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) 90 91# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ 92 _CET_NOTRACK jmp *TABLE(,INDEX,SCALE) 93#endif 94 95 .section .text.ssse3,"ax",@progbits 96#ifdef SHARED 97ENTRY (MEMCPY_CHK) 98 movl 12(%esp), %eax 99 cmpl %eax, 16(%esp) 100 jb HIDDEN_JUMPTARGET (__chk_fail) 101END (MEMCPY_CHK) 102#endif 103ENTRY (MEMCPY) 104 ENTRANCE 105 movl LEN(%esp), %ecx 106 movl SRC(%esp), %eax 107 movl DEST(%esp), %edx 108 109#ifdef USE_AS_MEMMOVE 110 cmp %eax, %edx 111 jb L(copy_forward) 112 je L(fwd_write_0bytes) 113 cmp $48, %ecx 114 jb L(bk_write_less48bytes) 115 add %ecx, %eax 116 cmp %eax, %edx 117 movl SRC(%esp), %eax 118 jb L(copy_backward) 119 120L(copy_forward): 121#endif 122 cmp $48, %ecx 123 jae L(48bytesormore) 124 125L(fwd_write_less32bytes): 126#ifndef USE_AS_MEMMOVE 127 cmp %dl, %al 128 jb L(bk_write) 129#endif 130 add %ecx, %edx 131 add %ecx, %eax 132 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 133#ifndef USE_AS_MEMMOVE 134L(bk_write): 135 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) 136#endif 137 138 ALIGN (4) 139/* 
ECX >= 48 here.  EDX is the caller's destination and need not be
   aligned; it is rounded up to the next 16-byte boundary below.  The
   first 16 source bytes are kept in XMM0 and stored unaligned to the
   original destination (saved in ESI) by each path, which covers the
   1..16 head bytes skipped by the rounding.  */
L(48bytesormore):
	movdqu	(%eax), %xmm0		/* XMM0 = first 16 source bytes.  */
	PUSH (%edi)
	movl	%edx, %edi
	and	$-16, %edx
	PUSH (%esi)
	cfi_remember_state		/* State with EDI and ESI pushed.  */
	add	$16, %edx		/* EDX = next 16-byte boundary > DEST.  */
	movl	%edi, %esi		/* ESI = original DEST.  */
	sub	%edx, %edi		/* EDI = DEST - EDX, i.e. -16..-1.  */
	add	%edi, %ecx		/* Remove the head bytes from the count.  */
	sub	%edi, %eax		/* Advance SRC by the same amount.  */

	/* Compare the remaining count with half the shared cache size.
	   Only the flags are consumed, by the `jae' below.  */
#ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
#else
# ifdef PIC
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_shared_cache_size_half, %ecx
# endif
#endif

	mov	%eax, %edi		/* mov leaves the cmp flags intact.  */
	jae	L(large_page)		/* Count >= threshold (unsigned).  */
	and	$0xf, %edi		/* EDI = SRC & 15.  */
	jz	L(shl_0)		/* SRC is 16-byte aligned as well.  */

	/* SRC is misaligned by EDI (1..15) bytes relative to the aligned
	   DEST: dispatch through L(shl_table), indexed by EDI.  The table
	   is outside this part of the file; presumably entry N is
	   L(shl_N).  */
	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)

	ALIGN (4)
/* SRC and DEST are both 16-byte aligned.  */
L(shl_0):
	movdqu	%xmm0, (%esi)		/* Store the head to the original DEST.  */
	xor	%edi, %edi		/* EDI = running byte offset.  */
	cmp	$127, %ecx
	ja	L(shl_0_gobble)		/* 128 bytes or more: 128-byte loops.  */
	lea	-32(%ecx), %ecx		/* Bias the count by -32, see below.  */
/* Straight-line code, 32 bytes per step (there is no backward branch).
   Because ECX is biased by -32, a borrow from `sub' means fewer than 32
   bytes remain after the step.  The movdqa and lea instructions between
   `sub' and `jb' do not modify the flags.  */
L(shl_0_loop):
	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
L(shl_0_end):
	lea	32(%ecx), %ecx		/* Undo the bias: ECX = bytes left.  */
	add	%ecx, %edi
	add	%edi, %edx		/* EDX = one past the last DEST byte.  */
	add	%edi, %eax		/* EAX = one past the last SRC byte.  */
	POP (%esi)
	POP (%edi)
	/* The table entry for ECX copies the tail using negative offsets
	   from EAX/EDX.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

	cfi_restore_state
	cfi_remember_state
/* Aligned copy of 128 bytes or more.  Choose between the plain loop and
   the prefetching loop: the threshold is EDI - EDI / 8, where EDI is
   half the data cache size.  */
L(shl_0_gobble):

#ifdef DATA_CACHE_SIZE_HALF
	/* Load the constant into EDI like the run-time branches below do;
	   the code after #endif derives the threshold from EDI.  A bare
	   `cmp' against ECX here would leave EDI at the zero written by the
	   `xor' in L(shl_0) (the only visible way to get here), so the
	   threshold would be 0 and every copy would take the memory
	   loop.  */
	mov	$DATA_CACHE_SIZE_HALF, %edi
#else
# ifdef PIC
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_data_cache_size_half@GOTOFF(%ebx), %edi
# else
	mov	__x86_data_cache_size_half, %edi
# endif
#endif
	mov	%edi, %esi		/* ESI (original DEST) is dead by now.  */
	shr	$3, %esi
	sub	%esi, %edi		/* EDI = size - size / 8.  */
	cmp	%edi, %ecx
	jae	L(shl_0_gobble_mem_start)
	sub	$128, %ecx		/* Bias the count by -128 for the loop.  */
	ALIGN (4)
/* Copy 128 bytes per iteration.  `jae' tests the borrow of the `sub':
   the stores and `lea' in between leave the flags alone.  */
L(shl_0_gobble_cache_loop):
	movdqa	(%eax), %xmm0
	movaps	0x10(%eax), %xmm1
	movaps	0x20(%eax), %xmm2
	movaps	0x30(%eax), %xmm3
	movaps	0x40(%eax), %xmm4
	movaps	0x50(%eax), %xmm5
	movaps	0x60(%eax), %xmm6
	movaps	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm1, 0x10(%edx)
	movaps	%xmm2, 0x20(%edx)
	movaps	%xmm3, 0x30(%edx)
	movaps	%xmm4, 0x40(%edx)
	movaps	%xmm5, 0x50(%edx)
	movaps	%xmm6, 0x60(%edx)
	movaps	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_cache_loop)
	add	$0x80, %ecx		/* Undo the bias: ECX = 0..127 left.  */
	cmp	$0x40, %ecx
	jb	L(shl_0_cache_less_64bytes)

	/* One 64-byte step.  */
	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx
L(shl_0_cache_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_cache_less_32bytes)
	/* One 32-byte step.  */
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx
L(shl_0_cache_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_cache_less_16bytes)
	/* One 16-byte step.  */
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx
L(shl_0_cache_less_16bytes):
	/* Fewer than 16 bytes left: point EAX/EDX past the end and let the
	   table entry for ECX copy the tail.  */
	add	%ecx, %edx
	add	%ecx, %eax
	POP (%esi)
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
/* Aligned copy whose size reached the data cache threshold.  */
L(shl_0_gobble_mem_start):
	/* When the low bytes of SRC and DEST are equal, use the
	   `rep movsl' path instead.  */
	cmp	%al, %dl
	je	L(copy_page_by_rep)
	sub	$128, %ecx		/* Bias the count by -128 for the loop.  */
/* Same 128-byte step as the cache loop, with non-temporal prefetches
   ahead of both the source and the destination.  */
L(shl_0_gobble_mem_loop):
	prefetchnta 0x1c0(%eax)
	prefetchnta 0x280(%eax)
	prefetchnta 0x1c0(%edx)
	prefetchnta 0x280(%edx)

	movdqa	(%eax), %xmm0
	movaps	0x10(%eax), %xmm1
	movaps	0x20(%eax), %xmm2
	movaps	0x30(%eax), %xmm3
	movaps	0x40(%eax), %xmm4
	movaps	0x50(%eax), %xmm5
	movaps	0x60(%eax), %xmm6
	movaps	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$0x80, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm1, 0x10(%edx)
	movaps	%xmm2, 0x20(%edx)
	movaps	%xmm3, 0x30(%edx)
	movaps	%xmm4, 0x40(%edx)
	movaps	%xmm5, 0x50(%edx)
	movaps	%xmm6, 0x60(%edx)
	movaps	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_mem_loop)
	add	$0x80, %ecx		/* Undo the bias: ECX = 0..127 left.  */
	cmp	$0x40, %ecx
	jb	L(shl_0_mem_less_64bytes)

	/* One 64-byte step.  */
	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx
L(shl_0_mem_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_mem_less_32bytes)
	/* One 32-byte step.  */
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx
L(shl_0_mem_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_mem_less_16bytes)
	/* One 16-byte step.  */
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx
L(shl_0_mem_less_16bytes):
	/* Fewer than 16 bytes left: point EAX/EDX past the end and let the
	   table entry for ECX copy the tail.  */
	add	%ecx, %edx
	add	%ecx, %eax
	POP (%esi)
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
/* SRC is 1 byte past a 16-byte boundary.  */
L(shl_1):
	/* PIC builds: EBX still holds the address the dispatch jumped to
	   (this label), so adding (table - .) turns it into the address of
	   L(table_48bytes_fwd) for the tail branch.  Expands to nothing
	   otherwise.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	sub	$1, %eax		/* Round SRC down to the boundary.  */
	movaps	(%eax), %xmm1		/* XMM1 = first aligned source block.  */
	xor	%edi, %edi		/* EDI = running byte offset.  */
	sub	$32, %ecx		/* Bias the count by -32 for the loop.  */
	movdqu	%xmm0, (%esi)		/* Store the head to the original DEST.  */
	POP (%esi)
L(shl_1_loop):

movdqa 16(%eax, %edi), %xmm2 397 sub $32, %ecx 398 movdqa 32(%eax, %edi), %xmm3 399 movdqa %xmm3, %xmm4 400 palignr $1, %xmm2, %xmm3 401 palignr $1, %xmm1, %xmm2 402 lea 32(%edi), %edi 403 movdqa %xmm2, -32(%edx, %edi) 404 movdqa %xmm3, -16(%edx, %edi) 405 406 jb L(shl_1_end) 407 408 movdqa 16(%eax, %edi), %xmm2 409 sub $32, %ecx 410 movdqa 32(%eax, %edi), %xmm3 411 movdqa %xmm3, %xmm1 412 palignr $1, %xmm2, %xmm3 413 palignr $1, %xmm4, %xmm2 414 lea 32(%edi), %edi 415 movdqa %xmm2, -32(%edx, %edi) 416 movdqa %xmm3, -16(%edx, %edi) 417 418 jae L(shl_1_loop) 419 420L(shl_1_end): 421 add $32, %ecx 422 add %ecx, %edi 423 add %edi, %edx 424 lea 1(%edi, %eax), %eax 425 POP (%edi) 426 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 427 428 cfi_restore_state 429 cfi_remember_state 430 ALIGN (4) 431L(shl_2): 432 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 433 sub $2, %eax 434 movaps (%eax), %xmm1 435 xor %edi, %edi 436 sub $32, %ecx 437 movdqu %xmm0, (%esi) 438 POP (%esi) 439L(shl_2_loop): 440 441 movdqa 16(%eax, %edi), %xmm2 442 sub $32, %ecx 443 movdqa 32(%eax, %edi), %xmm3 444 movdqa %xmm3, %xmm4 445 palignr $2, %xmm2, %xmm3 446 palignr $2, %xmm1, %xmm2 447 lea 32(%edi), %edi 448 movdqa %xmm2, -32(%edx, %edi) 449 movdqa %xmm3, -16(%edx, %edi) 450 451 jb L(shl_2_end) 452 453 movdqa 16(%eax, %edi), %xmm2 454 sub $32, %ecx 455 movdqa 32(%eax, %edi), %xmm3 456 movdqa %xmm3, %xmm1 457 palignr $2, %xmm2, %xmm3 458 palignr $2, %xmm4, %xmm2 459 lea 32(%edi), %edi 460 movdqa %xmm2, -32(%edx, %edi) 461 movdqa %xmm3, -16(%edx, %edi) 462 463 jae L(shl_2_loop) 464 465L(shl_2_end): 466 add $32, %ecx 467 add %ecx, %edi 468 add %edi, %edx 469 lea 2(%edi, %eax), %eax 470 POP (%edi) 471 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 472 473 cfi_restore_state 474 cfi_remember_state 475 ALIGN (4) 476L(shl_3): 477 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 478 sub $3, %eax 479 movaps (%eax), %xmm1 480 xor %edi, %edi 481 sub $32, %ecx 482 movdqu %xmm0, 
(%esi) 483 POP (%esi) 484L(shl_3_loop): 485 486 movdqa 16(%eax, %edi), %xmm2 487 sub $32, %ecx 488 movdqa 32(%eax, %edi), %xmm3 489 movdqa %xmm3, %xmm4 490 palignr $3, %xmm2, %xmm3 491 palignr $3, %xmm1, %xmm2 492 lea 32(%edi), %edi 493 movdqa %xmm2, -32(%edx, %edi) 494 movdqa %xmm3, -16(%edx, %edi) 495 496 jb L(shl_3_end) 497 498 movdqa 16(%eax, %edi), %xmm2 499 sub $32, %ecx 500 movdqa 32(%eax, %edi), %xmm3 501 movdqa %xmm3, %xmm1 502 palignr $3, %xmm2, %xmm3 503 palignr $3, %xmm4, %xmm2 504 lea 32(%edi), %edi 505 movdqa %xmm2, -32(%edx, %edi) 506 movdqa %xmm3, -16(%edx, %edi) 507 508 jae L(shl_3_loop) 509 510L(shl_3_end): 511 add $32, %ecx 512 add %ecx, %edi 513 add %edi, %edx 514 lea 3(%edi, %eax), %eax 515 POP (%edi) 516 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 517 518 cfi_restore_state 519 cfi_remember_state 520 ALIGN (4) 521L(shl_4): 522 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 523 sub $4, %eax 524 movaps (%eax), %xmm1 525 xor %edi, %edi 526 sub $32, %ecx 527 movdqu %xmm0, (%esi) 528 POP (%esi) 529L(shl_4_loop): 530 531 movdqa 16(%eax, %edi), %xmm2 532 sub $32, %ecx 533 movdqa 32(%eax, %edi), %xmm3 534 movdqa %xmm3, %xmm4 535 palignr $4, %xmm2, %xmm3 536 palignr $4, %xmm1, %xmm2 537 lea 32(%edi), %edi 538 movdqa %xmm2, -32(%edx, %edi) 539 movdqa %xmm3, -16(%edx, %edi) 540 541 jb L(shl_4_end) 542 543 movdqa 16(%eax, %edi), %xmm2 544 sub $32, %ecx 545 movdqa 32(%eax, %edi), %xmm3 546 movdqa %xmm3, %xmm1 547 palignr $4, %xmm2, %xmm3 548 palignr $4, %xmm4, %xmm2 549 lea 32(%edi), %edi 550 movdqa %xmm2, -32(%edx, %edi) 551 movdqa %xmm3, -16(%edx, %edi) 552 553 jae L(shl_4_loop) 554 555L(shl_4_end): 556 add $32, %ecx 557 add %ecx, %edi 558 add %edi, %edx 559 lea 4(%edi, %eax), %eax 560 POP (%edi) 561 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 562 563 cfi_restore_state 564 cfi_remember_state 565 ALIGN (4) 566L(shl_5): 567 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 568 sub $5, %eax 569 movaps (%eax), %xmm1 570 xor 
%edi, %edi 571 sub $32, %ecx 572 movdqu %xmm0, (%esi) 573 POP (%esi) 574L(shl_5_loop): 575 576 movdqa 16(%eax, %edi), %xmm2 577 sub $32, %ecx 578 movdqa 32(%eax, %edi), %xmm3 579 movdqa %xmm3, %xmm4 580 palignr $5, %xmm2, %xmm3 581 palignr $5, %xmm1, %xmm2 582 lea 32(%edi), %edi 583 movdqa %xmm2, -32(%edx, %edi) 584 movdqa %xmm3, -16(%edx, %edi) 585 586 jb L(shl_5_end) 587 588 movdqa 16(%eax, %edi), %xmm2 589 sub $32, %ecx 590 movdqa 32(%eax, %edi), %xmm3 591 movdqa %xmm3, %xmm1 592 palignr $5, %xmm2, %xmm3 593 palignr $5, %xmm4, %xmm2 594 lea 32(%edi), %edi 595 movdqa %xmm2, -32(%edx, %edi) 596 movdqa %xmm3, -16(%edx, %edi) 597 598 jae L(shl_5_loop) 599 600L(shl_5_end): 601 add $32, %ecx 602 add %ecx, %edi 603 add %edi, %edx 604 lea 5(%edi, %eax), %eax 605 POP (%edi) 606 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 607 608 cfi_restore_state 609 cfi_remember_state 610 ALIGN (4) 611L(shl_6): 612 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 613 sub $6, %eax 614 movaps (%eax), %xmm1 615 xor %edi, %edi 616 sub $32, %ecx 617 movdqu %xmm0, (%esi) 618 POP (%esi) 619L(shl_6_loop): 620 621 movdqa 16(%eax, %edi), %xmm2 622 sub $32, %ecx 623 movdqa 32(%eax, %edi), %xmm3 624 movdqa %xmm3, %xmm4 625 palignr $6, %xmm2, %xmm3 626 palignr $6, %xmm1, %xmm2 627 lea 32(%edi), %edi 628 movdqa %xmm2, -32(%edx, %edi) 629 movdqa %xmm3, -16(%edx, %edi) 630 631 jb L(shl_6_end) 632 633 movdqa 16(%eax, %edi), %xmm2 634 sub $32, %ecx 635 movdqa 32(%eax, %edi), %xmm3 636 movdqa %xmm3, %xmm1 637 palignr $6, %xmm2, %xmm3 638 palignr $6, %xmm4, %xmm2 639 lea 32(%edi), %edi 640 movdqa %xmm2, -32(%edx, %edi) 641 movdqa %xmm3, -16(%edx, %edi) 642 643 jae L(shl_6_loop) 644 645L(shl_6_end): 646 add $32, %ecx 647 add %ecx, %edi 648 add %edi, %edx 649 lea 6(%edi, %eax), %eax 650 POP (%edi) 651 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 652 653 cfi_restore_state 654 cfi_remember_state 655 ALIGN (4) 656L(shl_7): 657 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 658 
sub $7, %eax 659 movaps (%eax), %xmm1 660 xor %edi, %edi 661 sub $32, %ecx 662 movdqu %xmm0, (%esi) 663 POP (%esi) 664L(shl_7_loop): 665 666 movdqa 16(%eax, %edi), %xmm2 667 sub $32, %ecx 668 movdqa 32(%eax, %edi), %xmm3 669 movdqa %xmm3, %xmm4 670 palignr $7, %xmm2, %xmm3 671 palignr $7, %xmm1, %xmm2 672 lea 32(%edi), %edi 673 movdqa %xmm2, -32(%edx, %edi) 674 movdqa %xmm3, -16(%edx, %edi) 675 676 jb L(shl_7_end) 677 678 movdqa 16(%eax, %edi), %xmm2 679 sub $32, %ecx 680 movdqa 32(%eax, %edi), %xmm3 681 movdqa %xmm3, %xmm1 682 palignr $7, %xmm2, %xmm3 683 palignr $7, %xmm4, %xmm2 684 lea 32(%edi), %edi 685 movdqa %xmm2, -32(%edx, %edi) 686 movdqa %xmm3, -16(%edx, %edi) 687 688 jae L(shl_7_loop) 689 690L(shl_7_end): 691 add $32, %ecx 692 add %ecx, %edi 693 add %edi, %edx 694 lea 7(%edi, %eax), %eax 695 POP (%edi) 696 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 697 698 cfi_restore_state 699 cfi_remember_state 700 ALIGN (4) 701L(shl_8): 702 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 703 sub $8, %eax 704 movaps (%eax), %xmm1 705 xor %edi, %edi 706 sub $32, %ecx 707 movdqu %xmm0, (%esi) 708 POP (%esi) 709L(shl_8_loop): 710 711 movdqa 16(%eax, %edi), %xmm2 712 sub $32, %ecx 713 movdqa 32(%eax, %edi), %xmm3 714 movdqa %xmm3, %xmm4 715 palignr $8, %xmm2, %xmm3 716 palignr $8, %xmm1, %xmm2 717 lea 32(%edi), %edi 718 movdqa %xmm2, -32(%edx, %edi) 719 movdqa %xmm3, -16(%edx, %edi) 720 721 jb L(shl_8_end) 722 723 movdqa 16(%eax, %edi), %xmm2 724 sub $32, %ecx 725 movdqa 32(%eax, %edi), %xmm3 726 movdqa %xmm3, %xmm1 727 palignr $8, %xmm2, %xmm3 728 palignr $8, %xmm4, %xmm2 729 lea 32(%edi), %edi 730 movdqa %xmm2, -32(%edx, %edi) 731 movdqa %xmm3, -16(%edx, %edi) 732 733 jae L(shl_8_loop) 734 735L(shl_8_end): 736 add $32, %ecx 737 add %ecx, %edi 738 add %edi, %edx 739 lea 8(%edi, %eax), %eax 740 POP (%edi) 741 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 742 743 cfi_restore_state 744 cfi_remember_state 745 ALIGN (4) 746L(shl_9): 747 
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 748 sub $9, %eax 749 movaps (%eax), %xmm1 750 xor %edi, %edi 751 sub $32, %ecx 752 movdqu %xmm0, (%esi) 753 POP (%esi) 754L(shl_9_loop): 755 756 movdqa 16(%eax, %edi), %xmm2 757 sub $32, %ecx 758 movdqa 32(%eax, %edi), %xmm3 759 movdqa %xmm3, %xmm4 760 palignr $9, %xmm2, %xmm3 761 palignr $9, %xmm1, %xmm2 762 lea 32(%edi), %edi 763 movdqa %xmm2, -32(%edx, %edi) 764 movdqa %xmm3, -16(%edx, %edi) 765 766 jb L(shl_9_end) 767 768 movdqa 16(%eax, %edi), %xmm2 769 sub $32, %ecx 770 movdqa 32(%eax, %edi), %xmm3 771 movdqa %xmm3, %xmm1 772 palignr $9, %xmm2, %xmm3 773 palignr $9, %xmm4, %xmm2 774 lea 32(%edi), %edi 775 movdqa %xmm2, -32(%edx, %edi) 776 movdqa %xmm3, -16(%edx, %edi) 777 778 jae L(shl_9_loop) 779 780L(shl_9_end): 781 add $32, %ecx 782 add %ecx, %edi 783 add %edi, %edx 784 lea 9(%edi, %eax), %eax 785 POP (%edi) 786 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 787 788 cfi_restore_state 789 cfi_remember_state 790 ALIGN (4) 791L(shl_10): 792 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 793 sub $10, %eax 794 movaps (%eax), %xmm1 795 xor %edi, %edi 796 sub $32, %ecx 797 movdqu %xmm0, (%esi) 798 POP (%esi) 799L(shl_10_loop): 800 801 movdqa 16(%eax, %edi), %xmm2 802 sub $32, %ecx 803 movdqa 32(%eax, %edi), %xmm3 804 movdqa %xmm3, %xmm4 805 palignr $10, %xmm2, %xmm3 806 palignr $10, %xmm1, %xmm2 807 lea 32(%edi), %edi 808 movdqa %xmm2, -32(%edx, %edi) 809 movdqa %xmm3, -16(%edx, %edi) 810 811 jb L(shl_10_end) 812 813 movdqa 16(%eax, %edi), %xmm2 814 sub $32, %ecx 815 movdqa 32(%eax, %edi), %xmm3 816 movdqa %xmm3, %xmm1 817 palignr $10, %xmm2, %xmm3 818 palignr $10, %xmm4, %xmm2 819 lea 32(%edi), %edi 820 movdqa %xmm2, -32(%edx, %edi) 821 movdqa %xmm3, -16(%edx, %edi) 822 823 jae L(shl_10_loop) 824 825L(shl_10_end): 826 add $32, %ecx 827 add %ecx, %edi 828 add %edi, %edx 829 lea 10(%edi, %eax), %eax 830 POP (%edi) 831 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 832 833 
cfi_restore_state 834 cfi_remember_state 835 ALIGN (4) 836L(shl_11): 837 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 838 sub $11, %eax 839 movaps (%eax), %xmm1 840 xor %edi, %edi 841 sub $32, %ecx 842 movdqu %xmm0, (%esi) 843 POP (%esi) 844L(shl_11_loop): 845 846 movdqa 16(%eax, %edi), %xmm2 847 sub $32, %ecx 848 movdqa 32(%eax, %edi), %xmm3 849 movdqa %xmm3, %xmm4 850 palignr $11, %xmm2, %xmm3 851 palignr $11, %xmm1, %xmm2 852 lea 32(%edi), %edi 853 movdqa %xmm2, -32(%edx, %edi) 854 movdqa %xmm3, -16(%edx, %edi) 855 856 jb L(shl_11_end) 857 858 movdqa 16(%eax, %edi), %xmm2 859 sub $32, %ecx 860 movdqa 32(%eax, %edi), %xmm3 861 movdqa %xmm3, %xmm1 862 palignr $11, %xmm2, %xmm3 863 palignr $11, %xmm4, %xmm2 864 lea 32(%edi), %edi 865 movdqa %xmm2, -32(%edx, %edi) 866 movdqa %xmm3, -16(%edx, %edi) 867 868 jae L(shl_11_loop) 869 870L(shl_11_end): 871 add $32, %ecx 872 add %ecx, %edi 873 add %edi, %edx 874 lea 11(%edi, %eax), %eax 875 POP (%edi) 876 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 877 878 cfi_restore_state 879 cfi_remember_state 880 ALIGN (4) 881L(shl_12): 882 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 883 sub $12, %eax 884 movaps (%eax), %xmm1 885 xor %edi, %edi 886 sub $32, %ecx 887 movdqu %xmm0, (%esi) 888 POP (%esi) 889L(shl_12_loop): 890 891 movdqa 16(%eax, %edi), %xmm2 892 sub $32, %ecx 893 movdqa 32(%eax, %edi), %xmm3 894 movdqa %xmm3, %xmm4 895 palignr $12, %xmm2, %xmm3 896 palignr $12, %xmm1, %xmm2 897 lea 32(%edi), %edi 898 movdqa %xmm2, -32(%edx, %edi) 899 movdqa %xmm3, -16(%edx, %edi) 900 901 jb L(shl_12_end) 902 903 movdqa 16(%eax, %edi), %xmm2 904 sub $32, %ecx 905 movdqa 32(%eax, %edi), %xmm3 906 movdqa %xmm3, %xmm1 907 palignr $12, %xmm2, %xmm3 908 palignr $12, %xmm4, %xmm2 909 lea 32(%edi), %edi 910 movdqa %xmm2, -32(%edx, %edi) 911 movdqa %xmm3, -16(%edx, %edi) 912 913 jae L(shl_12_loop) 914 915L(shl_12_end): 916 add $32, %ecx 917 add %ecx, %edi 918 add %edi, %edx 919 lea 12(%edi, %eax), %eax 920 POP (%edi) 
921 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 922 923 cfi_restore_state 924 cfi_remember_state 925 ALIGN (4) 926L(shl_13): 927 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 928 sub $13, %eax 929 movaps (%eax), %xmm1 930 xor %edi, %edi 931 sub $32, %ecx 932 movdqu %xmm0, (%esi) 933 POP (%esi) 934L(shl_13_loop): 935 936 movdqa 16(%eax, %edi), %xmm2 937 sub $32, %ecx 938 movdqa 32(%eax, %edi), %xmm3 939 movdqa %xmm3, %xmm4 940 palignr $13, %xmm2, %xmm3 941 palignr $13, %xmm1, %xmm2 942 lea 32(%edi), %edi 943 movdqa %xmm2, -32(%edx, %edi) 944 movdqa %xmm3, -16(%edx, %edi) 945 946 jb L(shl_13_end) 947 948 movdqa 16(%eax, %edi), %xmm2 949 sub $32, %ecx 950 movdqa 32(%eax, %edi), %xmm3 951 movdqa %xmm3, %xmm1 952 palignr $13, %xmm2, %xmm3 953 palignr $13, %xmm4, %xmm2 954 lea 32(%edi), %edi 955 movdqa %xmm2, -32(%edx, %edi) 956 movdqa %xmm3, -16(%edx, %edi) 957 958 jae L(shl_13_loop) 959 960L(shl_13_end): 961 add $32, %ecx 962 add %ecx, %edi 963 add %edi, %edx 964 lea 13(%edi, %eax), %eax 965 POP (%edi) 966 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 967 968 cfi_restore_state 969 cfi_remember_state 970 ALIGN (4) 971L(shl_14): 972 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 973 sub $14, %eax 974 movaps (%eax), %xmm1 975 xor %edi, %edi 976 sub $32, %ecx 977 movdqu %xmm0, (%esi) 978 POP (%esi) 979L(shl_14_loop): 980 981 movdqa 16(%eax, %edi), %xmm2 982 sub $32, %ecx 983 movdqa 32(%eax, %edi), %xmm3 984 movdqa %xmm3, %xmm4 985 palignr $14, %xmm2, %xmm3 986 palignr $14, %xmm1, %xmm2 987 lea 32(%edi), %edi 988 movdqa %xmm2, -32(%edx, %edi) 989 movdqa %xmm3, -16(%edx, %edi) 990 991 jb L(shl_14_end) 992 993 movdqa 16(%eax, %edi), %xmm2 994 sub $32, %ecx 995 movdqa 32(%eax, %edi), %xmm3 996 movdqa %xmm3, %xmm1 997 palignr $14, %xmm2, %xmm3 998 palignr $14, %xmm4, %xmm2 999 lea 32(%edi), %edi 1000 movdqa %xmm2, -32(%edx, %edi) 1001 movdqa %xmm3, -16(%edx, %edi) 1002 1003 jae L(shl_14_loop) 1004 1005L(shl_14_end): 1006 add $32, %ecx 1007 
add %ecx, %edi 1008 add %edi, %edx 1009 lea 14(%edi, %eax), %eax 1010 POP (%edi) 1011 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 1012 1013 cfi_restore_state 1014 cfi_remember_state 1015 ALIGN (4) 1016L(shl_15): 1017 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 1018 sub $15, %eax 1019 movaps (%eax), %xmm1 1020 xor %edi, %edi 1021 sub $32, %ecx 1022 movdqu %xmm0, (%esi) 1023 POP (%esi) 1024L(shl_15_loop): 1025 1026 movdqa 16(%eax, %edi), %xmm2 1027 sub $32, %ecx 1028 movdqa 32(%eax, %edi), %xmm3 1029 movdqa %xmm3, %xmm4 1030 palignr $15, %xmm2, %xmm3 1031 palignr $15, %xmm1, %xmm2 1032 lea 32(%edi), %edi 1033 movdqa %xmm2, -32(%edx, %edi) 1034 movdqa %xmm3, -16(%edx, %edi) 1035 1036 jb L(shl_15_end) 1037 1038 movdqa 16(%eax, %edi), %xmm2 1039 sub $32, %ecx 1040 movdqa 32(%eax, %edi), %xmm3 1041 movdqa %xmm3, %xmm1 1042 palignr $15, %xmm2, %xmm3 1043 palignr $15, %xmm4, %xmm2 1044 lea 32(%edi), %edi 1045 movdqa %xmm2, -32(%edx, %edi) 1046 movdqa %xmm3, -16(%edx, %edi) 1047 1048 jae L(shl_15_loop) 1049 1050L(shl_15_end): 1051 add $32, %ecx 1052 add %ecx, %edi 1053 add %edi, %edx 1054 lea 15(%edi, %eax), %eax 1055 POP (%edi) 1056 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 1057 1058 1059 ALIGN (4) 1060L(fwd_write_44bytes): 1061 movl -44(%eax), %ecx 1062 movl %ecx, -44(%edx) 1063L(fwd_write_40bytes): 1064 movl -40(%eax), %ecx 1065 movl %ecx, -40(%edx) 1066L(fwd_write_36bytes): 1067 movl -36(%eax), %ecx 1068 movl %ecx, -36(%edx) 1069L(fwd_write_32bytes): 1070 movl -32(%eax), %ecx 1071 movl %ecx, -32(%edx) 1072L(fwd_write_28bytes): 1073 movl -28(%eax), %ecx 1074 movl %ecx, -28(%edx) 1075L(fwd_write_24bytes): 1076 movl -24(%eax), %ecx 1077 movl %ecx, -24(%edx) 1078L(fwd_write_20bytes): 1079 movl -20(%eax), %ecx 1080 movl %ecx, -20(%edx) 1081L(fwd_write_16bytes): 1082 movl -16(%eax), %ecx 1083 movl %ecx, -16(%edx) 1084L(fwd_write_12bytes): 1085 movl -12(%eax), %ecx 1086 movl %ecx, -12(%edx) 1087L(fwd_write_8bytes): 1088 movl -8(%eax), 
%ecx 1089 movl %ecx, -8(%edx) 1090L(fwd_write_4bytes): 1091 movl -4(%eax), %ecx 1092 movl %ecx, -4(%edx) 1093L(fwd_write_0bytes): 1094#ifdef USE_AS_MEMPCPY 1095 movl %edx, %eax 1096#else 1097 movl DEST(%esp), %eax 1098#endif 1099 RETURN 1100 1101 ALIGN (4) 1102L(fwd_write_5bytes): 1103 movl -5(%eax), %ecx 1104 movl -4(%eax), %eax 1105 movl %ecx, -5(%edx) 1106 movl %eax, -4(%edx) 1107#ifdef USE_AS_MEMPCPY 1108 movl %edx, %eax 1109#else 1110 movl DEST(%esp), %eax 1111#endif 1112 RETURN 1113 1114 ALIGN (4) 1115L(fwd_write_45bytes): 1116 movl -45(%eax), %ecx 1117 movl %ecx, -45(%edx) 1118L(fwd_write_41bytes): 1119 movl -41(%eax), %ecx 1120 movl %ecx, -41(%edx) 1121L(fwd_write_37bytes): 1122 movl -37(%eax), %ecx 1123 movl %ecx, -37(%edx) 1124L(fwd_write_33bytes): 1125 movl -33(%eax), %ecx 1126 movl %ecx, -33(%edx) 1127L(fwd_write_29bytes): 1128 movl -29(%eax), %ecx 1129 movl %ecx, -29(%edx) 1130L(fwd_write_25bytes): 1131 movl -25(%eax), %ecx 1132 movl %ecx, -25(%edx) 1133L(fwd_write_21bytes): 1134 movl -21(%eax), %ecx 1135 movl %ecx, -21(%edx) 1136L(fwd_write_17bytes): 1137 movl -17(%eax), %ecx 1138 movl %ecx, -17(%edx) 1139L(fwd_write_13bytes): 1140 movl -13(%eax), %ecx 1141 movl %ecx, -13(%edx) 1142L(fwd_write_9bytes): 1143 movl -9(%eax), %ecx 1144 movl %ecx, -9(%edx) 1145 movl -5(%eax), %ecx 1146 movl %ecx, -5(%edx) 1147L(fwd_write_1bytes): 1148 movzbl -1(%eax), %ecx 1149 movb %cl, -1(%edx) 1150#ifdef USE_AS_MEMPCPY 1151 movl %edx, %eax 1152#else 1153 movl DEST(%esp), %eax 1154#endif 1155 RETURN 1156 1157 ALIGN (4) 1158L(fwd_write_46bytes): 1159 movl -46(%eax), %ecx 1160 movl %ecx, -46(%edx) 1161L(fwd_write_42bytes): 1162 movl -42(%eax), %ecx 1163 movl %ecx, -42(%edx) 1164L(fwd_write_38bytes): 1165 movl -38(%eax), %ecx 1166 movl %ecx, -38(%edx) 1167L(fwd_write_34bytes): 1168 movl -34(%eax), %ecx 1169 movl %ecx, -34(%edx) 1170L(fwd_write_30bytes): 1171 movl -30(%eax), %ecx 1172 movl %ecx, -30(%edx) 1173L(fwd_write_26bytes): 1174 movl -26(%eax), %ecx 1175 movl %ecx, 
-26(%edx) 1176L(fwd_write_22bytes): 1177 movl -22(%eax), %ecx 1178 movl %ecx, -22(%edx) 1179L(fwd_write_18bytes): 1180 movl -18(%eax), %ecx 1181 movl %ecx, -18(%edx) 1182L(fwd_write_14bytes): 1183 movl -14(%eax), %ecx 1184 movl %ecx, -14(%edx) 1185L(fwd_write_10bytes): 1186 movl -10(%eax), %ecx 1187 movl %ecx, -10(%edx) 1188L(fwd_write_6bytes): 1189 movl -6(%eax), %ecx 1190 movl %ecx, -6(%edx) 1191L(fwd_write_2bytes): 1192 movzwl -2(%eax), %ecx 1193 movw %cx, -2(%edx) 1194#ifdef USE_AS_MEMPCPY 1195 movl %edx, %eax 1196#else 1197 movl DEST(%esp), %eax 1198#endif 1199 RETURN 1200 1201 ALIGN (4) 1202L(fwd_write_47bytes): 1203 movl -47(%eax), %ecx 1204 movl %ecx, -47(%edx) 1205L(fwd_write_43bytes): 1206 movl -43(%eax), %ecx 1207 movl %ecx, -43(%edx) 1208L(fwd_write_39bytes): 1209 movl -39(%eax), %ecx 1210 movl %ecx, -39(%edx) 1211L(fwd_write_35bytes): 1212 movl -35(%eax), %ecx 1213 movl %ecx, -35(%edx) 1214L(fwd_write_31bytes): 1215 movl -31(%eax), %ecx 1216 movl %ecx, -31(%edx) 1217L(fwd_write_27bytes): 1218 movl -27(%eax), %ecx 1219 movl %ecx, -27(%edx) 1220L(fwd_write_23bytes): 1221 movl -23(%eax), %ecx 1222 movl %ecx, -23(%edx) 1223L(fwd_write_19bytes): 1224 movl -19(%eax), %ecx 1225 movl %ecx, -19(%edx) 1226L(fwd_write_15bytes): 1227 movl -15(%eax), %ecx 1228 movl %ecx, -15(%edx) 1229L(fwd_write_11bytes): 1230 movl -11(%eax), %ecx 1231 movl %ecx, -11(%edx) 1232L(fwd_write_7bytes): 1233 movl -7(%eax), %ecx 1234 movl %ecx, -7(%edx) 1235L(fwd_write_3bytes): 1236 movzwl -3(%eax), %ecx 1237 movzbl -1(%eax), %eax 1238 movw %cx, -3(%edx) 1239 movb %al, -1(%edx) 1240#ifdef USE_AS_MEMPCPY 1241 movl %edx, %eax 1242#else 1243 movl DEST(%esp), %eax 1244#endif 1245 RETURN_END 1246 1247 cfi_restore_state 1248 cfi_remember_state 1249 ALIGN (4) 1250L(large_page): 1251 movdqu (%eax), %xmm1 1252 movdqu %xmm0, (%esi) 1253 movntdq %xmm1, (%edx) 1254 add $0x10, %eax 1255 add $0x10, %edx 1256 sub $0x10, %ecx 1257 cmp %al, %dl 1258 je L(copy_page_by_rep) 1259L(large_page_loop_init): 
1260 POP (%esi) 1261 sub $0x80, %ecx 1262 POP (%edi) 1263L(large_page_loop): 1264 prefetchnta 0x1c0(%eax) 1265 prefetchnta 0x280(%eax) 1266 movdqu (%eax), %xmm0 1267 movdqu 0x10(%eax), %xmm1 1268 movdqu 0x20(%eax), %xmm2 1269 movdqu 0x30(%eax), %xmm3 1270 movdqu 0x40(%eax), %xmm4 1271 movdqu 0x50(%eax), %xmm5 1272 movdqu 0x60(%eax), %xmm6 1273 movdqu 0x70(%eax), %xmm7 1274 lea 0x80(%eax), %eax 1275 lfence 1276 sub $0x80, %ecx 1277 movntdq %xmm0, (%edx) 1278 movntdq %xmm1, 0x10(%edx) 1279 movntdq %xmm2, 0x20(%edx) 1280 movntdq %xmm3, 0x30(%edx) 1281 movntdq %xmm4, 0x40(%edx) 1282 movntdq %xmm5, 0x50(%edx) 1283 movntdq %xmm6, 0x60(%edx) 1284 movntdq %xmm7, 0x70(%edx) 1285 lea 0x80(%edx), %edx 1286 jae L(large_page_loop) 1287 add $0x80, %ecx 1288 cmp $0x40, %ecx 1289 jb L(large_page_less_64bytes) 1290 1291 movdqu (%eax), %xmm0 1292 movdqu 0x10(%eax), %xmm1 1293 movdqu 0x20(%eax), %xmm2 1294 movdqu 0x30(%eax), %xmm3 1295 lea 0x40(%eax), %eax 1296 1297 movntdq %xmm0, (%edx) 1298 movntdq %xmm1, 0x10(%edx) 1299 movntdq %xmm2, 0x20(%edx) 1300 movntdq %xmm3, 0x30(%edx) 1301 lea 0x40(%edx), %edx 1302 sub $0x40, %ecx 1303L(large_page_less_64bytes): 1304 cmp $32, %ecx 1305 jb L(large_page_less_32bytes) 1306 movdqu (%eax), %xmm0 1307 movdqu 0x10(%eax), %xmm1 1308 lea 0x20(%eax), %eax 1309 movntdq %xmm0, (%edx) 1310 movntdq %xmm1, 0x10(%edx) 1311 lea 0x20(%edx), %edx 1312 sub $0x20, %ecx 1313L(large_page_less_32bytes): 1314 add %ecx, %edx 1315 add %ecx, %eax 1316 sfence 1317 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 1318 1319 cfi_restore_state 1320 cfi_remember_state 1321 ALIGN (4) 1322L(copy_page_by_rep): 1323 mov %eax, %esi 1324 mov %edx, %edi 1325 mov %ecx, %edx 1326 shr $2, %ecx 1327 and $3, %edx 1328 rep movsl 1329 jz L(copy_page_by_rep_exit) 1330 cmp $2, %edx 1331 jb L(copy_page_by_rep_left_1) 1332 movzwl (%esi), %eax 1333 movw %ax, (%edi) 1334 add $2, %esi 1335 add $2, %edi 1336 sub $2, %edx 1337 jz L(copy_page_by_rep_exit) 1338L(copy_page_by_rep_left_1): 
1339 movzbl (%esi), %eax 1340 movb %al, (%edi) 1341L(copy_page_by_rep_exit): 1342 POP (%esi) 1343 POP (%edi) 1344 movl DEST(%esp), %eax 1345#ifdef USE_AS_MEMPCPY 1346 movl LEN(%esp), %ecx 1347 add %ecx, %eax 1348#endif 1349 RETURN 1350 1351 ALIGN (4) 1352L(bk_write_44bytes): 1353 movl 40(%eax), %ecx 1354 movl %ecx, 40(%edx) 1355L(bk_write_40bytes): 1356 movl 36(%eax), %ecx 1357 movl %ecx, 36(%edx) 1358L(bk_write_36bytes): 1359 movl 32(%eax), %ecx 1360 movl %ecx, 32(%edx) 1361L(bk_write_32bytes): 1362 movl 28(%eax), %ecx 1363 movl %ecx, 28(%edx) 1364L(bk_write_28bytes): 1365 movl 24(%eax), %ecx 1366 movl %ecx, 24(%edx) 1367L(bk_write_24bytes): 1368 movl 20(%eax), %ecx 1369 movl %ecx, 20(%edx) 1370L(bk_write_20bytes): 1371 movl 16(%eax), %ecx 1372 movl %ecx, 16(%edx) 1373L(bk_write_16bytes): 1374 movl 12(%eax), %ecx 1375 movl %ecx, 12(%edx) 1376L(bk_write_12bytes): 1377 movl 8(%eax), %ecx 1378 movl %ecx, 8(%edx) 1379L(bk_write_8bytes): 1380 movl 4(%eax), %ecx 1381 movl %ecx, 4(%edx) 1382L(bk_write_4bytes): 1383 movl (%eax), %ecx 1384 movl %ecx, (%edx) 1385L(bk_write_0bytes): 1386 movl DEST(%esp), %eax 1387#ifdef USE_AS_MEMPCPY 1388 movl LEN(%esp), %ecx 1389 add %ecx, %eax 1390#endif 1391 RETURN 1392 1393 ALIGN (4) 1394L(bk_write_45bytes): 1395 movl 41(%eax), %ecx 1396 movl %ecx, 41(%edx) 1397L(bk_write_41bytes): 1398 movl 37(%eax), %ecx 1399 movl %ecx, 37(%edx) 1400L(bk_write_37bytes): 1401 movl 33(%eax), %ecx 1402 movl %ecx, 33(%edx) 1403L(bk_write_33bytes): 1404 movl 29(%eax), %ecx 1405 movl %ecx, 29(%edx) 1406L(bk_write_29bytes): 1407 movl 25(%eax), %ecx 1408 movl %ecx, 25(%edx) 1409L(bk_write_25bytes): 1410 movl 21(%eax), %ecx 1411 movl %ecx, 21(%edx) 1412L(bk_write_21bytes): 1413 movl 17(%eax), %ecx 1414 movl %ecx, 17(%edx) 1415L(bk_write_17bytes): 1416 movl 13(%eax), %ecx 1417 movl %ecx, 13(%edx) 1418L(bk_write_13bytes): 1419 movl 9(%eax), %ecx 1420 movl %ecx, 9(%edx) 1421L(bk_write_9bytes): 1422 movl 5(%eax), %ecx 1423 movl %ecx, 5(%edx) 
L(bk_write_5bytes):
	/* Last dword of the N % 4 == 1 chain; it covers bytes 1-4.  */
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	/* Byte 0 completes the copy.  */
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	/* Return value: DEST, or DEST + LEN when built as mempcpy.  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

/* Backward tail copy for N = 46, 42, ..., 6, 2.  L(bk_write_Nbytes)
   copies exactly N bytes from (EAX) to (EDX), both pointing at the
   START of those bytes: one dword per rung through ECX, highest offset
   first (N - 4 down to 2), then the word at offset 0.  */
	ALIGN (4)
L(bk_write_46bytes):
	movl	42(%eax), %ecx
	movl	%ecx, 42(%edx)
L(bk_write_42bytes):
	movl	38(%eax), %ecx
	movl	%ecx, 38(%edx)
L(bk_write_38bytes):
	movl	34(%eax), %ecx
	movl	%ecx, 34(%edx)
L(bk_write_34bytes):
	movl	30(%eax), %ecx
	movl	%ecx, 30(%edx)
L(bk_write_30bytes):
	movl	26(%eax), %ecx
	movl	%ecx, 26(%edx)
L(bk_write_26bytes):
	movl	22(%eax), %ecx
	movl	%ecx, 22(%edx)
L(bk_write_22bytes):
	movl	18(%eax), %ecx
	movl	%ecx, 18(%edx)
L(bk_write_18bytes):
	movl	14(%eax), %ecx
	movl	%ecx, 14(%edx)
L(bk_write_14bytes):
	movl	10(%eax), %ecx
	movl	%ecx, 10(%edx)
L(bk_write_10bytes):
	movl	6(%eax), %ecx
	movl	%ecx, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	/* Return value: DEST, or DEST + LEN when built as mempcpy.  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

/* Backward tail copy for N = 47, 43, ..., 7, 3: dwords at offsets
   N - 4 down to 3, then the last three bytes.  The chain continues by
   falling through into L(bk_write_11bytes).  */
	ALIGN (4)
L(bk_write_47bytes):
	movl	43(%eax), %ecx
	movl	%ecx, 43(%edx)
L(bk_write_43bytes):
	movl	39(%eax), %ecx
	movl	%ecx, 39(%edx)
L(bk_write_39bytes):
	movl	35(%eax), %ecx
	movl	%ecx, 35(%edx)
L(bk_write_35bytes):
	movl	31(%eax), %ecx
	movl	%ecx, 31(%edx)
L(bk_write_31bytes):
	movl	27(%eax), %ecx
	movl	%ecx, 27(%edx)
L(bk_write_27bytes):
	movl	23(%eax), %ecx
	movl	%ecx, 23(%edx)
L(bk_write_23bytes):
	movl	19(%eax), %ecx
	movl	%ecx, 19(%edx)
L(bk_write_19bytes):
	movl	15(%eax), %ecx
	movl	%ecx, 15(%edx)
L(bk_write_15bytes):
	movl	11(%eax), %ecx
	movl	%ecx, 11(%edx)
L(bk_write_11bytes):
	/* Lower rungs of the N % 4 == 3 backward chain: copy from (EAX) to
	   (EDX), one dword per rung through ECX, highest offset first.  */
	movl	7(%eax), %ecx
	movl	%ecx, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
L(bk_write_3bytes):
	/* Bytes 1-2 as a word, then byte 0.  */
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	/* EAX (the read pointer) is dead after this load, so it doubles as
	   the scratch register.  */
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	/* Return value: DEST, or DEST + LEN when built as mempcpy.  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
#ifdef USE_AS_MEMMOVE
	/* L(copy_backward) is the next code assembled into .text and is
	   entered with EBX still pushed by ENTRANCE in PIC builds.  RETURN
	   re-declares that push after the RET so the unwind info for the
	   code below matches the real stack; RETURN_END would leave the
	   CFI state at "EBX popped".  */
	RETURN
#else
	/* Last return in the function: no CFI state needs restoring.  */
	RETURN_END
#endif


/* Jump tables.  Each entry is JMPTBL (target, table): the offset of the
   target from the table base in PIC builds, or the absolute address of
   the target otherwise.  BRANCH_TO_JMPTBL_ENTRY indexes a table with a
   register scaled by 4.  */
	.pushsection .rodata.ssse3,"a",@progbits
	ALIGN (2)
/* Forward tail copy, indexed by the number of bytes left (0-47).  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	ALIGN (2)
/* Sixteen L(shl_N) entry points, N = 0-15.  The code that indexes this
   table is outside this chunk.  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	ALIGN (2)
/* Backward tail copy, indexed by the number of bytes left (0-47).  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

#ifdef USE_AS_MEMMOVE
/* memmove only: copy from the highest address downwards.  The entry
   code branches here with EAX = SRC, EDX = DEST and ECX = LEN when
   DEST lies above SRC but below SRC + LEN and LEN is at least 48.
   ESI becomes the read pointer and EAX a scratch register; ESI and
   EDX both point one past the bytes still to be copied.  */
	ALIGN (4)
L(copy_backward):
	PUSH (%esi)
	movl	%eax, %esi
	add	%ecx, %edx
	add	%ecx, %esi
	/* Make the write pointer 4-byte aligned first.  */
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.  */
	sub	$32, %ecx
	movl	-4(%esi), %eax
	movl	%eax, -4(%edx)
	movl	-8(%esi), %eax
	movl	%eax, -8(%edx)
	movl	-12(%esi), %eax
	movl	%eax, -12(%edx)
	movl	-16(%esi), %eax
	movl	%eax, -16(%edx)
	movl	-20(%esi), %eax
	movl	%eax, -20(%edx)
	movl	-24(%esi), %eax
	movl	%eax, -24(%edx)
	movl	-28(%esi), %eax
	movl	%eax, -28(%edx)
	movl	-32(%esi), %eax
	movl	%eax, -32(%edx)
	sub	$32, %edx
	sub	$32, %esi

L(bk_write_less32bytes):
	/* Turn the end pointers into start pointers for the remaining ECX
	   bytes (EAX = read, EDX = write), which is what the
	   L(bk_write_Nbytes) handlers expect.  */
	movl	%esi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%esi)
/* Also entered directly from the entry code when LEN is below 48.  */
L(bk_write_less48bytes):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* The code below is reached with ESI still on the stack; undo the
	   CFI effect of the POP above.  */
	CFI_PUSH (%esi)
	ALIGN (4)
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	/* EDX is odd: copy one byte.  */
	sub	$1, %esi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%esi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	/* Copy one word; EDX is 4-byte aligned afterwards.  */
	sub	$2, %esi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%esi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	ALIGN (4)
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  Copy one dword at a time
   until it is; at most three dwords are needed, so the third copy is
   not followed by another test.  */
L(bk_ssse3_align):
	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

/* Copy 64 bytes per iteration, highest 16 bytes first: unaligned loads
   from (ESI), aligned stores to the 16-byte aligned (EDX).  */
L(bk_ssse3_cpy):
	sub	$64, %esi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%esi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%esi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%esi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%esi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif /* USE_AS_MEMMOVE */

END (MEMCPY)

#endif