/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include "asm-syntax.h"

	.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

ENTRY (__mempcpy_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (__mempcpy_avx512_no_vzeroupper)

ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)

ENTRY (__memmove_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
	add	%RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	lea	(%rsi, %rdx), %rcx
	lea	(%rdi, %rdx), %r9
	cmp	$512, %rdx
	ja	L(512bytesormore)

L(check):
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups	-0xC0(%rcx), %zmm5
	vmovups	-0x80(%rcx), %zmm6
	vmovups	-0x40(%rcx), %zmm7
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups	%zmm5, -0xC0(%r9)
	vmovups	%zmm6, -0x80(%r9)
	vmovups	%zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	-0x80(%rcx), %zmm2
	vmovups	-0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%r9)
	vmovups	%zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	-0x40(%rcx), %ymm2
	vmovdqu	-0x20(%rcx), %ymm3
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, 0x20(%rdi)
	vmovdqu	%ymm2, -0x40(%r9)
	vmovdqu	%ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rcx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
# else
	mov	__x86_shared_cache_size_half(%rip), %r8
# endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups	%zmm9, -0x1C0(%r9)
	vmovups	%zmm10, -0x180(%r9)
	vmovups	%zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups	%zmm13, -0xC0(%r9)
	vmovups	%zmm14, -0x80(%r9)
	vmovups	%zmm15, -0x40(%r9)
	ret

L(1024bytesormore):
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
	sub	$512, %r9
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access.  */
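/* The last 512 bytes of the source are already held in %zmm8-%zmm15 and
   %r9 has been pulled back by 512, so the loop below copies forward in
   512-byte chunks, prefetching the next chunk with prefetcht1, until
   %rdi reaches %r9 (the start of the final 512 destination bytes).  The
   preloaded tail is stored after the loop, which keeps an overlapping
   forward memmove correct even if the loop has clobbered the original
   tail bytes.  */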
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
	vmovups	%zmm8, (%r9)
	vmovups	%zmm9, 0x40(%r9)
	vmovups	%zmm10, 0x80(%r9)
	vmovups	%zmm11, 0xC0(%r9)
	vmovups	%zmm12, 0x100(%r9)
	vmovups	%zmm13, 0x140(%r9)
	vmovups	%zmm14, 0x180(%r9)
	vmovups	%zmm15, 0x1C0(%r9)
	ret

L(1024bytesormore_bkw):
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups	0x180(%rsi), %zmm9
	vmovups	0x140(%rsi), %zmm10
	vmovups	0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups	0x80(%rsi), %zmm13
	vmovups	0x40(%rsi), %zmm14
	vmovups	(%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups	-0x40(%rcx), %zmm0
	vmovups	-0x80(%rcx), %zmm1
	vmovups	-0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups	-0x140(%rcx), %zmm4
	vmovups	-0x180(%rcx), %zmm5
	vmovups	-0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups	%zmm0, -0x40(%r9)
	vmovups	%zmm1, -0x80(%r9)
	vmovups	%zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups	%zmm4, -0x140(%r9)
	vmovups	%zmm5, -0x180(%r9)
	vmovups	%zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
	vmovups	%zmm8, -0x40(%rdi)
	vmovups	%zmm9, -0x80(%rdi)
	vmovups	%zmm10, -0xC0(%rdi)
	vmovups	%zmm11, -0x100(%rdi)
	vmovups	%zmm12, -0x140(%rdi)
	vmovups	%zmm13, -0x180(%rdi)
	vmovups	%zmm14, -0x1C0(%rdi)
	vmovups	%zmm15, -0x200(%rdi)
	ret

L(preloop_large):
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	mov	%rdi, %r11
/* Align destination for access with non-temporal stores in the loop.  */
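/* %r8 takes a copy of the original destination (already saved in %r11),
   %rdi is advanced to the next 128-byte boundary, and %r8 then holds
   minus the number of bytes skipped (1..128).  %rsi and %rdx are
   adjusted by the same amount so source and length stay in step; the
   skipped head is covered by the %zmm4/%zmm5 stores through %r11 after
   the loop.  For example, a destination of 0x1003 is advanced to 0x1080
   and the 125 head bytes come from those trailing vmovups stores.  */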
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
L(gobble_256bytes_nt_loop):
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
	sfence
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)

L(preloop_large_bkw):
	vmovups	-0x80(%rcx), %zmm4
	vmovups	-0x40(%rcx), %zmm5

/* Align end of destination for access with non-temporal stores.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0, -0x100(%r9)
	vmovntdq %zmm1, -0xC0(%r9)
	vmovntdq %zmm2, -0x80(%r9)
	vmovntdq %zmm3, -0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
	sfence
	vmovups	%zmm4, -0x80(%r8)
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)
END (__memmove_avx512_no_vzeroupper)

strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif