/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7, if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.  */
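
/* Illustrative sketch only (not part of the build): points 1-3 above
   amount to loading both the head and the tail of the buffer before
   storing either, so that one branchless sequence handles every size
   in [VEC_SIZE, 2 * VEC_SIZE], even when src and dst overlap.  In
   rough C, assuming a hypothetical vec_t of VEC_SIZE bytes with
   unaligned load/store helpers:

       vec_t head = vec_loadu (src);
       vec_t tail = vec_loadu (src + n - VEC_SIZE);
       vec_storeu (dst, head);
       vec_storeu (dst + n - VEC_SIZE, tail);

   The two ranges overlap when n < 2 * VEC_SIZE; because both loads
   happen before both stores, the result is still correct.  The same
   idea is scaled up to 4 and 8 vectors for larger sizes.  */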

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0 xmm0
#endif

#ifndef YMM0
# define YMM0 ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Whether to align before movsb.  Ultimately we want 64 byte
   alignment, but it is not worth loading 4x VEC for VEC_SIZE == 16.  */
#define ALIGN_MOVSB (VEC_SIZE > 16)
/* Number of bytes to align movsb to.  */
#define MOVSB_ALIGN_TO 64

#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
#define LARGE_MOV_SIZE (MOV_SIZE > 4)

#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
# error MOV_SIZE Unknown
#endif

#if LARGE_MOV_SIZE
# define SMALL_SIZE_OFFSET (4)
#else
# define SMALL_SIZE_OFFSET (0)
#endif

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif

#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif

#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif

/* Bytes per page for the large_memcpy inner loop.  */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

/* Amount to shift __x86_shared_non_temporal_threshold by for the
   bound for memcpy_large_4x.  This is essentially used to indicate
   that the copy is far beyond the scope of L3 (assuming no user
   config x86_non_temporal_threshold) and to use a more aggressively
   unrolled loop.  NB: before increasing the value also update
   initialization of x86_non_temporal_threshold.  */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif

/* Avoid short distance rep movsb only with non-SSE vector.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
        VMOVU (offset)base, vec0; \
        VMOVU ((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
        VMOVNT vec0, (offset)base; \
        VMOVNT vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
        VMOVU (offset)base, vec0; \
        VMOVU ((offset) + VEC_SIZE)base, vec1; \
        VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
        VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
        VMOVNT vec0, (offset)base; \
        VMOVNT vec1, ((offset) + VEC_SIZE)base; \
        VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
        VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif
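
/* For reference (illustrative expansion, not assembled here): when
   LARGE_LOAD_SIZE is VEC_SIZE * 4, e.g. VEC_SIZE == 32,

       LOAD_ONE_SET ((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))

   expands to four unaligned VMOVU loads from 0(%rsi), 32(%rsi),
   64(%rsi) and 96(%rsi), and the matching STORE_ONE_SET expands to
   four VMOVNT non-temporal stores at the same offsets from the
   destination, so one set moves LARGE_LOAD_SIZE bytes.  With
   VEC_SIZE == 64 a set is only two vectors, keeping the set size at
   128 bytes.  */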

#ifndef SECTION
# error SECTION is not defined!
#endif

        .section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
        cmp %RDX_LP, %RCX_LP
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
        mov %RDI_LP, %RAX_LP
        add %RDX_LP, %RAX_LP
        jmp L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
        cmp %RDX_LP, %RCX_LP
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
        movq %rdi, %rax
L(start):
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
        cmp $VEC_SIZE, %RDX_LP
        jb L(less_vec)
        /* Load regardless.  */
        VMOVU (%rsi), %VEC(0)
        cmp $(VEC_SIZE * 2), %RDX_LP
        ja L(more_2x_vec)
        /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
#if !(defined USE_MULTIARCH && IS_IN (libc))
        ZERO_UPPER_VEC_REGISTERS_RETURN
#else
        VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
        cmp %RDX_LP, %RCX_LP
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
        mov %RDI_LP, %RAX_LP
        add %RDX_LP, %RAX_LP
        jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
        cmp %RDX_LP, %RCX_LP
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
        movq %rdi, %rax
L(start_erms):
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
        cmp $VEC_SIZE, %RDX_LP
        jb L(less_vec)
        /* Load regardless.  */
        VMOVU (%rsi), %VEC(0)
        cmp $(VEC_SIZE * 2), %RDX_LP
        ja L(movsb_more_2x_vec)
        /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
# if VEC_SIZE > 16
        ZERO_UPPER_VEC_REGISTERS_RETURN
# else
        ret
# endif
#endif

#if LARGE_MOV_SIZE
        /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
           ENTRY block and L(less_vec).  */
        .p2align 4,, 8
L(between_4_7):
        /* From 4 to 7.  No branch when size == 4.  */
        movl (%rsi), %ecx
        movl (%rsi, %rdx), %esi
        movl %ecx, (%rdi)
        movl %esi, (%rdi, %rdx)
        ret
#endif
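
/* The copies below for sizes smaller than VEC_SIZE reuse the
   overlapping head/tail idea with narrower moves.  Illustrative
   sketch (not part of the build) of the 8-to-15-byte case handled by
   L(between_8_15):

       uint64_t head, tail;
       memcpy (&head, src, 8);
       memcpy (&tail, src + n - 8, 8);
       memcpy (dst, &head, 8);
       memcpy (dst + n - 8, &tail, 8);

   Both loads are performed before either store, so overlapping
   src/dst and any size in [8, 15] are handled without branching.  */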

        .p2align 4
L(less_vec):
        /* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
        cmpl $32, %edx
        jae L(between_32_63)
#endif
#if VEC_SIZE > 16
        cmpl $16, %edx
        jae L(between_16_31)
#endif
        cmpl $8, %edx
        jae L(between_8_15)
#if SMALL_MOV_SIZE
        cmpl $4, %edx
#else
        subq $4, %rdx
#endif
        jae L(between_4_7)
        cmpl $(1 - SMALL_SIZE_OFFSET), %edx
        jl L(copy_0)
        movb (%rsi), %cl
        je L(copy_1)
        movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
        movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
L(copy_1):
        movb %cl, (%rdi)
L(copy_0):
        ret

#if SMALL_MOV_SIZE
        .p2align 4,, 8
L(between_4_7):
        /* From 4 to 7.  No branch when size == 4.  */
        movl -4(%rsi, %rdx), %ecx
        movl (%rsi), %esi
        movl %ecx, -4(%rdi, %rdx)
        movl %esi, (%rdi)
        ret
#endif

#if VEC_SIZE > 16
        /* From 16 to 31.  No branch when size == 16.  */
        .p2align 4,, 8
L(between_16_31):
        vmovdqu (%rsi), %xmm0
        vmovdqu -16(%rsi, %rdx), %xmm1
        vmovdqu %xmm0, (%rdi)
        vmovdqu %xmm1, -16(%rdi, %rdx)
        /* No ymm registers have been touched.  */
        ret
#endif

#if VEC_SIZE > 32
        .p2align 4,, 10
L(between_32_63):
        /* From 32 to 63.  No branch when size == 32.  */
        VMOVU (%rsi), %YMM0
        VMOVU -32(%rsi, %rdx), %YMM1
        VMOVU %YMM0, (%rdi)
        VMOVU %YMM1, -32(%rdi, %rdx)
        VZEROUPPER_RETURN
#endif

        .p2align 4,, 10
L(between_8_15):
        /* From 8 to 15.  No branch when size == 8.  */
        movq -8(%rsi, %rdx), %rcx
        movq (%rsi), %rsi
        movq %rsi, (%rdi)
        movq %rcx, -8(%rdi, %rdx)
        ret

        .p2align 4,, 10
L(last_4x_vec):
        /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */

        /* VEC(0) and VEC(1) have already been loaded.  */
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2)
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), VEC_SIZE(%rdi)
        VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx)
        VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
        VZEROUPPER_RETURN

        .p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
        cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
        ja L(movsb)
#endif
L(more_2x_vec):
        /* More than 2 * VEC and there may be overlap between
           destination and source.  */
        cmpq $(VEC_SIZE * 8), %rdx
        ja L(more_8x_vec)
        /* Load VEC(1) regardless.  VEC(0) has already been loaded.  */
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        cmpq $(VEC_SIZE * 4), %rdx
        jbe L(last_4x_vec)
        /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4)
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
        VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
        VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), VEC_SIZE(%rdi)
        VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
        VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx)
        VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
        VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
        VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
        VZEROUPPER_RETURN
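
/* Decision logic for the 8x VEC and larger copies below (rough C
   model for illustration only; variable names are not in the source):

       ptrdiff_t diff = dst - src;                      // rcx
       if ((size_t) diff < len)                         // dst in (src, src + len)
         goto backward_copy;                            // dst == src handled as nop
       if (len > __x86_shared_non_temporal_threshold)
         goto large_memcpy;                             // NT stores, no overlap here
       int must_forward = (((diff + len) ^ diff) < 0);  // src > dst overlap
       int no_4k_alias = (diff & (PAGE_SIZE - 256)) != 0;
       if (must_forward || no_4k_alias)
         goto forward_copy;
       goto backward_copy;

   Forward copy is forced when needed for correctness; otherwise the
   direction is picked from the low bits of dst - src so that the loop
   avoids 4k false aliasing between its loads and stores.  */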

        .p2align 4,, 4
L(more_8x_vec):
        movq %rdi, %rcx
        subq %rsi, %rcx
        /* Go to backward temporal copy if overlap no matter what, as
           backward REP MOVSB is slow and we don't want to use NT stores
           if there is overlap.  */
        cmpq %rdx, %rcx
        /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
        jb L(more_8x_vec_backward_check_nop)
        /* Check if non-temporal move candidate.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Check non-temporal store threshold.  */
        cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
        ja L(large_memcpy_2x)
#endif
        /* To reach this point there cannot be overlap with dst > src.
           So check for overlap with src > dst, in which case correctness
           requires forward copy.  Otherwise decide between backward and
           forward copy depending on address aliasing.  */

        /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
           but less than __x86_shared_non_temporal_threshold.  */
L(more_8x_vec_check):
        /* rcx contains dst - src.  Add back length (rdx).  */
        leaq (%rcx, %rdx), %r8
        /* If r8 has different sign than rcx then there is overlap so we
           must do forward copy.  */
        xorq %rcx, %r8
        /* Isolate just sign bit of r8.  */
        shrq $63, %r8
        /* Get 4k difference dst - src.  */
        andl $(PAGE_SIZE - 256), %ecx
        /* If r8 is non-zero we must copy forward for correctness.
           Otherwise, if ecx is zero, dst and src 4k alias (4k false
           aliasing), so do backward copy.  */
        addl %r8d, %ecx
        jz L(more_8x_vec_backward)

        /* Entry if rdx is greater than __x86_shared_non_temporal_threshold
           but there is overlap, or if coming from the short distance
           movsb check.  */
L(more_8x_vec_forward):
        /* Load first and last 4 * VEC to support overlapping addresses.  */

        /* First vec was already loaded into VEC(0).  */
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
        /* Save beginning of dst.  */
        movq %rdi, %rcx
        /* Align dst to VEC_SIZE - 1.  */
        orq $(VEC_SIZE - 1), %rdi
        VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
        VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)

        /* Subtract dst from src.  Add back after dst aligned.  */
        subq %rcx, %rsi
        /* Finish aligning dst.  */
        incq %rdi
        /* Restore src adjusted with new value for aligned dst.  */
        addq %rdi, %rsi
        /* Store end of buffer minus tail in rdx.  */
        leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx

        /* Don't use multi-byte nop to align.  */
        .p2align 4,, 11
L(loop_4x_vec_forward):
        /* Copy 4 * VEC at a time forward.  */
        VMOVU (%rsi), %VEC(1)
        VMOVU VEC_SIZE(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
        subq $-(VEC_SIZE * 4), %rsi
        VMOVA %VEC(1), (%rdi)
        VMOVA %VEC(2), VEC_SIZE(%rdi)
        VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi)
        VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi)
        subq $-(VEC_SIZE * 4), %rdi
        cmpq %rdi, %rdx
        ja L(loop_4x_vec_forward)
        /* Store the last 4 * VEC.  */
        VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx)
        VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx)
        VMOVU %VEC(7), VEC_SIZE(%rdx)
        VMOVU %VEC(8), (%rdx)
        /* Store the first VEC.  */
        VMOVU %VEC(0), (%rcx)
        /* Keep L(nop_backward) target close to jmp for 2-byte encoding.  */
L(nop_backward):
        VZEROUPPER_RETURN
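
/* Note on the alignment trick shared by the forward loop above and
   the backward loop below: the destination pointer used by the loop
   is rounded to a VEC_SIZE boundary inside the region (orq/incq
   rounds up for the forward copy, andq rounds down for the backward
   copy), which can skip up to VEC_SIZE bytes at that edge.  Those
   bytes are covered by the unaligned head/tail vectors loaded before
   the loop and stored after it, so the loop body can use aligned
   VMOVA stores with no edge-case branches.  */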

        .p2align 4,, 8
L(more_8x_vec_backward_check_nop):
        /* rcx contains dst - src.  Test for dst == src to skip all of
           memmove.  */
        testq %rcx, %rcx
        jz L(nop_backward)
L(more_8x_vec_backward):
        /* Load the first 4 * VEC and last VEC to support overlapping
           addresses.  */

        /* First vec was also loaded into VEC(0).  */
        VMOVU VEC_SIZE(%rsi), %VEC(5)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
        /* Beginning of region for 4x backward copy stored in rcx.  */
        leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8)
        /* Subtract dst from src.  Add back after dst aligned.  */
        subq %rdi, %rsi
        /* Align dst.  */
        andq $-(VEC_SIZE), %rcx
        /* Restore src.  */
        addq %rcx, %rsi

        /* Don't use multi-byte nop to align.  */
        .p2align 4,, 11
L(loop_4x_vec_backward):
        /* Copy 4 * VEC at a time backward.  */
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3)
        VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4)
        addq $(VEC_SIZE * -4), %rsi
        VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx)
        VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
        VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx)
        VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx)
        addq $(VEC_SIZE * -4), %rcx
        cmpq %rcx, %rdi
        jb L(loop_4x_vec_backward)
        /* Store the first 4 * VEC.  */
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(5), VEC_SIZE(%rdi)
        VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi)
        VZEROUPPER_RETURN

#if defined USE_MULTIARCH && IS_IN (libc)
        /* L(skip_short_movsb_check) is only used with ERMS.  Not for
           FSRM.  */
        .p2align 5,, 16
# if ALIGN_MOVSB
L(skip_short_movsb_check):
#  if MOVSB_ALIGN_TO > VEC_SIZE
        VMOVU VEC_SIZE(%rsi), %VEC(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
        /* If the CPU does not have FSRM there are two options for
           aligning: align src if dst and src 4k alias, otherwise align
           dst.  */
        testl $(PAGE_SIZE - 512), %ecx
        jnz L(movsb_align_dst)
        /* Fall through.  dst and src 4k alias.  It's better to align src
           here because the bottleneck will be loads due to the false
           dependency on dst.  */

        /* rcx already has dst - src.  */
        movq %rcx, %r9
        /* Add src to len.  Subtract back after src aligned.  -1 because
           src is initially aligned to MOVSB_ALIGN_TO - 1.  */
        leaq -1(%rsi, %rdx), %rcx
        /* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
        orq $(MOVSB_ALIGN_TO - 1), %rsi
        /* Restore dst and len adjusted with new values for aligned src.  */
        leaq 1(%rsi, %r9), %rdi
        subq %rsi, %rcx
        /* Finish aligning src.  */
        incq %rsi

        rep movsb

        VMOVU %VEC(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
        VMOVU %VEC(1), VEC_SIZE(%r8)
#  endif
        VZEROUPPER_RETURN
# endif
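
/* L(movsb) below implements step 6 of the scheme described at the top
   of the file: sizes in [__x86_rep_movsb_threshold,
   __x86_rep_movsb_stop_threshold) are copied with REP MOVSB.  When
   ALIGN_MOVSB, one or two VECs are loaded up front and stored after
   the REP MOVSB so that the string copy itself can start at a
   MOVSB_ALIGN_TO (64-byte) boundary: on FSRM CPUs the destination is
   aligned (L(movsb_align_dst)), while on ERMS-only CPUs the source is
   aligned instead when src and dst 4k alias (see
   L(skip_short_movsb_check) above).  */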

        .p2align 4,, 12
L(movsb):
        movq %rdi, %rcx
        subq %rsi, %rcx
        /* Go to backward temporal copy if overlap no matter what, as
           backward REP MOVSB is slow and we don't want to use NT stores
           if there is overlap.  */
        cmpq %rdx, %rcx
        /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
        jb L(more_8x_vec_backward_check_nop)
# if ALIGN_MOVSB
        /* Save dest for storing aligning VECs later.  */
        movq %rdi, %r8
# endif
        /* If above __x86_rep_movsb_stop_threshold this is most likely a
           candidate for NT moves as well.  */
        cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
        jae L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
        /* Only avoid short movsb if CPU has FSRM.  */
        testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
        jz L(skip_short_movsb_check)
#  if AVOID_SHORT_DISTANCE_REP_MOVSB
        /* Avoid "rep movsb" if RCX, the distance between source and
           destination, is N*4GB + [1..63] with N >= 0.  */

        /* ecx contains dst - src.  Because the backward copy conditions
           were checked earlier, the only remaining slow-movsb case,
           src = dst + [0, 63], has ecx in [-63, 0].  Use an unsigned
           comparison with -64 to check for that case.  */
        cmpl $-64, %ecx
        ja L(more_8x_vec_forward)
#  endif
# endif
# if ALIGN_MOVSB
#  if MOVSB_ALIGN_TO > VEC_SIZE
        VMOVU VEC_SIZE(%rsi), %VEC(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
        /* Fall through means the CPU has FSRM.  In that case exclusively
           align destination.  */
L(movsb_align_dst):
        /* Subtract dst from src.  Add back after dst aligned.  */
        subq %rdi, %rsi
        /* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
        addq $(MOVSB_ALIGN_TO - 1), %rdi
        /* Add dst to len.  Subtract back after dst aligned.  */
        leaq (%r8, %rdx), %rcx
        /* Finish aligning dst.  */
        andq $-(MOVSB_ALIGN_TO), %rdi
        /* Restore src and len adjusted with new values for aligned dst.  */
        addq %rdi, %rsi
        subq %rdi, %rcx

        rep movsb

        /* Store VECs loaded for aligning.  */
        VMOVU %VEC(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
        VMOVU %VEC(1), VEC_SIZE(%r8)
#  endif
        VZEROUPPER_RETURN
# else  /* !ALIGN_MOVSB.  */
L(skip_short_movsb_check):
        mov %RDX_LP, %RCX_LP
        rep movsb
        ret
# endif
#endif

        .p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
        /* Entry from L(large_memcpy_2x) has a redundant load of
           __x86_shared_non_temporal_threshold(%rip).  L(large_memcpy_2x)
           is only used for the non-ERMS memmove, which is generally less
           common.  */
L(large_memcpy_2x):
        mov __x86_shared_non_temporal_threshold(%rip), %R11_LP
        cmp %R11_LP, %RDX_LP
        jb L(more_8x_vec_check)
        /* To reach this point it is impossible to have overlap with
           dst > src.  What remains to check is overlap with src > dst.
           rcx already contains dst - src.  Negate rcx to get src - dst.
           If length > rcx then there is overlap and forward copy is
           best.  */
        negq %rcx
        cmpq %rcx, %rdx
        ja L(more_8x_vec_forward)

        /* Cache align destination.  First store the first 64 bytes then
           adjust alignments.  */

        /* First vec was also loaded into VEC(0).  */
# if VEC_SIZE < 64
        VMOVU VEC_SIZE(%rsi), %VEC(1)
#  if VEC_SIZE < 32
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
#  endif
# endif
        VMOVU %VEC(0), (%rdi)
# if VEC_SIZE < 64
        VMOVU %VEC(1), VEC_SIZE(%rdi)
#  if VEC_SIZE < 32
        VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
#  endif
# endif

        /* Adjust source, destination, and size.  */
        movq %rdi, %r8
        andq $63, %r8
        /* Get the negative of offset for alignment.  */
        subq $64, %r8
        /* Adjust source.  */
        subq %r8, %rsi
        /* Adjust destination which should be aligned now.  */
        subq %r8, %rdi
        /* Adjust length.  */
        addq %r8, %rdx

        /* Test if source and destination addresses will alias.  If they
           do, the larger pipeline in large_memcpy_4x alleviates the
           performance drop.  */

        /* ecx contains -(dst - src).  NOT of ecx gives dst - src - 1,
           which works for testing aliasing.  */
        notl %ecx
        movq %rdx, %r10
        testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
        jz L(large_memcpy_4x)

        /* r11 has __x86_shared_non_temporal_threshold.  Shift it left
           by LOG_4X_MEMCPY_THRESH to get the L(large_memcpy_4x)
           threshold.  */
        shlq $LOG_4X_MEMCPY_THRESH, %r11
        cmp %r11, %rdx
        jae L(large_memcpy_4x)
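
/* The 2-page loop below streams two 4 KiB pages per outer iteration:
   each inner iteration loads one set from the current page and one
   set from the page PAGE_SIZE bytes ahead, then writes both sets with
   non-temporal stores.  Rough C model (illustrative only):

       while (npage_pairs--)                // r10: outer counter
         {
           for (off = 0; off < PAGE_SIZE; off += LARGE_LOAD_SIZE)
             {                              // ecx: inner counter
               nt_store (dst + off, load (src + off));
               nt_store (dst + off + PAGE_SIZE, load (src + off + PAGE_SIZE));
             }
           src += 2 * PAGE_SIZE;
           dst += 2 * PAGE_SIZE;
         }

   Interleaving two streams helps hide load latency while keeping each
   stream sequential for the hardware prefetcher; the remainder that is
   not a multiple of 2 * PAGE_SIZE is copied by the tail loop.  */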

        /* edx will store remainder size for copying tail.  */
        andl $(PAGE_SIZE * 2 - 1), %edx
        /* r10 stores outer loop counter.  */
        shrq $(LOG_PAGE_SIZE + 1), %r10
        /* Copy 4x VEC at a time from 2 pages.  */
        .p2align 4
L(loop_large_memcpy_2x_outer):
        /* ecx stores inner loop counter.  */
        movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
        PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
        PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
        PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
        PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
        /* Load vectors from rsi.  */
        LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
        LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
        subq $-LARGE_LOAD_SIZE, %rsi
        /* Non-temporal store vectors to rdi.  */
        STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
        STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
        subq $-LARGE_LOAD_SIZE, %rdi
        decl %ecx
        jnz L(loop_large_memcpy_2x_inner)
        addq $PAGE_SIZE, %rdi
        addq $PAGE_SIZE, %rsi
        decq %r10
        jne L(loop_large_memcpy_2x_outer)
        sfence

        /* Check if only last 4 loads are needed.  */
        cmpl $(VEC_SIZE * 4), %edx
        jbe L(large_memcpy_2x_end)

        /* Handle the last 2 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_2x_tail):
        /* Copy 4 * VEC at a time forward with aligned stores.  */
        PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
        PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        subq $-(VEC_SIZE * 4), %rsi
        addl $-(VEC_SIZE * 4), %edx
        VMOVA %VEC(0), (%rdi)
        VMOVA %VEC(1), VEC_SIZE(%rdi)
        VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
        subq $-(VEC_SIZE * 4), %rdi
        cmpl $(VEC_SIZE * 4), %edx
        ja L(loop_large_memcpy_2x_tail)

L(large_memcpy_2x_end):
        /* Store the last 4 * VEC.  */
        VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
        VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)

        VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
        VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
        VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
        VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
        VZEROUPPER_RETURN
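
/* L(large_memcpy_4x) below is the same scheme widened to four pages
   per outer iteration.  It is selected either when src and dst page
   alias (the larger pipeline alleviates the performance drop) or when
   the size exceeds __x86_shared_non_temporal_threshold shifted left by
   LOG_4X_MEMCPY_THRESH, i.e. the copy is far beyond the scope of L3 as
   noted at the LOG_4X_MEMCPY_THRESH definition.  Only one prefetch set
   is issued per page here, as noted in the inner loop.  */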

        .p2align 4
L(large_memcpy_4x):
        /* edx will store remainder size for copying tail.  */
        andl $(PAGE_SIZE * 4 - 1), %edx
        /* r10 stores outer loop counter.  */
        shrq $(LOG_PAGE_SIZE + 2), %r10
        /* Copy 4x VEC at a time from 4 pages.  */
        .p2align 4
L(loop_large_memcpy_4x_outer):
        /* ecx stores inner loop counter.  */
        movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
        /* Only one prefetch set per page as doing 4 pages gives more
           time for the prefetcher to keep up.  */
        PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
        PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
        PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
        PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
        /* Load vectors from rsi.  */
        LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
        LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
        LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
        LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
        subq $-LARGE_LOAD_SIZE, %rsi
        /* Non-temporal store vectors to rdi.  */
        STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
        STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
        STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
        STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
        subq $-LARGE_LOAD_SIZE, %rdi
        decl %ecx
        jnz L(loop_large_memcpy_4x_inner)
        addq $(PAGE_SIZE * 3), %rdi
        addq $(PAGE_SIZE * 3), %rsi
        decq %r10
        jne L(loop_large_memcpy_4x_outer)
        sfence
        /* Check if only last 4 loads are needed.  */
        cmpl $(VEC_SIZE * 4), %edx
        jbe L(large_memcpy_4x_end)

        /* Handle the last 4 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_4x_tail):
        /* Copy 4 * VEC at a time forward with aligned stores.  */
        PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
        PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        subq $-(VEC_SIZE * 4), %rsi
        addl $-(VEC_SIZE * 4), %edx
        VMOVA %VEC(0), (%rdi)
        VMOVA %VEC(1), VEC_SIZE(%rdi)
        VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
        subq $-(VEC_SIZE * 4), %rdi
        cmpl $(VEC_SIZE * 4), %edx
        ja L(loop_large_memcpy_4x_tail)

L(large_memcpy_4x_end):
        /* Store the last 4 * VEC.  */
        VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
        VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)

        VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
        VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
        VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
        VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
        VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
              MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
              MEMCPY_SYMBOL (__memcpy, unaligned))