/* memmove/memcpy/mempcpy optimized for aligned access with SSSE3.
   All versions must be listed in ifunc-impl-list.c.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>
# ifndef MEMMOVE
#  define MEMMOVE	__memmove_ssse3
#  define MEMMOVE_CHK	__memmove_chk_ssse3
#  define MEMCPY	__memcpy_ssse3
#  define MEMCPY_CHK	__memcpy_chk_ssse3
#  define MEMPCPY	__mempcpy_ssse3
#  define MEMPCPY_CHK	__mempcpy_chk_ssse3
# endif

	.section .text.ssse3, "ax", @progbits
# if defined SHARED
ENTRY(MEMPCPY_CHK)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET(__chk_fail)
END(MEMPCPY_CHK)
# endif

ENTRY(MEMPCPY)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END(MEMPCPY)

# if defined SHARED
ENTRY(MEMMOVE_CHK)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET(__chk_fail)
END(MEMMOVE_CHK)
# endif

ENTRY_P2ALIGN(MEMMOVE, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	movq	%rdi, %rax
L(start):
	cmpq	$16, %rdx
	jb	L(copy_0_15)

	/* These loads are always useful.  */
	movups	0(%rsi), %xmm0
	movups	-16(%rsi, %rdx), %xmm7
	cmpq	$32, %rdx
	ja	L(more_2x_vec)

	movups	%xmm0, 0(%rdi)
	movups	%xmm7, -16(%rdi, %rdx)
	ret

	.p2align 4,, 4
L(copy_0_15):
	cmpl	$4, %edx
	jb	L(copy_0_3)
	cmpl	$8, %edx
	jb	L(copy_4_7)
	movq	0(%rsi), %rcx
	movq	-8(%rsi, %rdx), %rsi
	movq	%rcx, 0(%rdi)
	movq	%rsi, -8(%rdi, %rdx)
	ret

	.p2align 4,, 4
L(copy_4_7):
	movl	0(%rsi), %ecx
	movl	-4(%rsi, %rdx), %esi
	movl	%ecx, 0(%rdi)
	movl	%esi, -4(%rdi, %rdx)
	ret

	.p2align 4,, 4
L(copy_0_3):
	decl	%edx
	jl	L(copy_0_0)
	movb	(%rsi), %cl
	je	L(copy_1_1)

	movzwl	-1(%rsi, %rdx), %esi
	movw	%si, -1(%rdi, %rdx)
L(copy_1_1):
	movb	%cl, (%rdi)
L(copy_0_0):
	ret

	.p2align 4,, 4
L(copy_4x_vec):
	movups	16(%rsi), %xmm1
	movups	-32(%rsi, %rdx), %xmm2

	movups	%xmm0, 0(%rdi)
	movups	%xmm1, 16(%rdi)
	movups	%xmm2, -32(%rdi, %rdx)
	movups	%xmm7, -16(%rdi, %rdx)
L(nop):
	ret

	.p2align 4
L(more_2x_vec):
	cmpq	$64, %rdx
	jbe	L(copy_4x_vec)

	/* We use rcx later to compute the `palignr` shift value.  */
	movq	%rdi, %rcx

	/* Copy backward when the regions overlap with dst > src, as
	   required for memmove safety.  */
	subq	%rsi, %rcx
	cmpq	%rdx, %rcx
	jb	L(copy_backward)

	/* Load tail.  */

	/* -16(%rsi, %rdx) already loaded into xmm7.  */
	movups	-32(%rsi, %rdx), %xmm8
	movups	-48(%rsi, %rdx), %xmm9

	/* Get misalignment.  */
	andl	$0xf, %ecx

	movq	%rsi, %r9
	addq	%rcx, %rsi
	andq	$-16, %rsi
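
	/* Rough sketch of the forward path being set up here (informal
	   pseudocode only; the names are illustrative and do not appear
	   in the code):

	       shift = (dst - src) & 15;        kept in ecx
	       src   = (src + shift) & ~15;     16-byte aligned rsi
	       prev  = 16 aligned bytes at src;  xmm1, loaded below

	   The copy loops further down read aligned vectors from rsi,
	   recombine neighbouring vectors with `palignr`, and write them
	   with aligned (or non-temporal) stores.  The shift also selects
	   which loop to enter: every loop is padded to a fixed size, so
	   the entry point is simply a base label plus shift * 64 (or
	   shift * 96 for the large non-temporal loops).  */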
	/* Get first vec for `palignr`.  */
	movaps	(%rsi), %xmm1

	/* We have already loaded (%rsi), so it is safe to do this store
	   before the loop.  */
	movups	%xmm0, (%rdi)

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %RDX_LP
# else
	cmp	__x86_shared_cache_size_half(%rip), %rdx
# endif
	ja	L(large_memcpy)

	leaq	-64(%rdi, %rdx), %r8
	andq	$-16, %rdi
	movl	$48, %edx

	leaq	L(loop_fwd_start)(%rip), %r9
	sall	$6, %ecx
	addq	%r9, %rcx
	jmp	*%rcx

	.p2align 4,, 8
L(copy_backward):
	/* rcx is dst - src; if it is zero then dst == src and there is
	   nothing to do.  */
	testq	%rcx, %rcx
	jz	L(nop)

	/* Preload tail.  */

	/* (%rsi) already loaded into xmm0.  */
	movups	16(%rsi), %xmm4
	movups	32(%rsi), %xmm5

	/* Keep the original dst in r8, turn rsi into the src - dst
	   delta, and point rdi at the highest 16-byte aligned position
	   whose 48-byte store block still fits inside the destination;
	   rsi then becomes the corresponding source position, aligned
	   down to 16 bytes.  */
	movq	%rdi, %r8
	subq	%rdi, %rsi
	leaq	-49(%rdi, %rdx), %rdi
	andq	$-16, %rdi
	addq	%rdi, %rsi
	andq	$-16, %rsi

	movaps	48(%rsi), %xmm6


	leaq	L(loop_bkwd_start)(%rip), %r9
	andl	$0xf, %ecx
	sall	$6, %ecx
	addq	%r9, %rcx
	jmp	*%rcx

	.p2align 4,, 8
L(large_memcpy):
	movups	-64(%r9, %rdx), %xmm10
	movups	-80(%r9, %rdx), %xmm11

	/* The large loops below are spaced 96 bytes apart, so the entry
	   offset is the misalignment * 96 (computed as rcx * 32 * 3).  */
	sall	$5, %ecx
	leal	(%rcx, %rcx, 2), %r8d
	leaq	-96(%rdi, %rdx), %rcx
	andq	$-16, %rdi
	leaq	L(large_loop_fwd_start)(%rip), %rdx
	addq	%r8, %rdx
	jmp	*%rdx


	/* Instead of a typical jump table all 16 loops are exactly
	   64 bytes in size.  So, we can just jump to the first loop +
	   the misalignment * 64.  Before modifying any loop ensure all
	   their sizes match!  */
	.p2align 6
L(loop_fwd_start):
L(loop_fwd_0x0):
	movaps	16(%rsi), %xmm1
	movaps	32(%rsi), %xmm2
	movaps	48(%rsi), %xmm3
	movaps	%xmm1, 16(%rdi)
	movaps	%xmm2, 32(%rdi)
	movaps	%xmm3, 48(%rdi)
	addq	%rdx, %rdi
	addq	%rdx, %rsi
	cmpq	%rdi, %r8
	ja	L(loop_fwd_0x0)
L(end_loop_fwd):
	movups	%xmm9, 16(%r8)
	movups	%xmm8, 32(%r8)
	movups	%xmm7, 48(%r8)
	ret

	/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
	   60 bytes otherwise.  */
# define ALIGNED_LOOP_FWD(align_by);	\
	.p2align 6;	\
L(loop_fwd_ ## align_by):	\
	movaps	16(%rsi), %xmm0;	\
	movaps	32(%rsi), %xmm2;	\
	movaps	48(%rsi), %xmm3;	\
	movaps	%xmm3, %xmm4;	\
	palignr	$align_by, %xmm2, %xmm3;	\
	palignr	$align_by, %xmm0, %xmm2;	\
	palignr	$align_by, %xmm1, %xmm0;	\
	movaps	%xmm4, %xmm1;	\
	movaps	%xmm0, 16(%rdi);	\
	movaps	%xmm2, 32(%rdi);	\
	movaps	%xmm3, 48(%rdi);	\
	addq	%rdx, %rdi;	\
	addq	%rdx, %rsi;	\
	cmpq	%rdi, %r8;	\
	ja	L(loop_fwd_ ## align_by);	\
	jmp	L(end_loop_fwd);

	/* Must be in descending order.  */
	ALIGNED_LOOP_FWD (0xf)
	ALIGNED_LOOP_FWD (0xe)
	ALIGNED_LOOP_FWD (0xd)
	ALIGNED_LOOP_FWD (0xc)
	ALIGNED_LOOP_FWD (0xb)
	ALIGNED_LOOP_FWD (0xa)
	ALIGNED_LOOP_FWD (0x9)
	ALIGNED_LOOP_FWD (0x8)
	ALIGNED_LOOP_FWD (0x7)
	ALIGNED_LOOP_FWD (0x6)
	ALIGNED_LOOP_FWD (0x5)
	ALIGNED_LOOP_FWD (0x4)
	ALIGNED_LOOP_FWD (0x3)
	ALIGNED_LOOP_FWD (0x2)
	ALIGNED_LOOP_FWD (0x1)

	.p2align 6
L(large_loop_fwd_start):
L(large_loop_fwd_0x0):
	movaps	16(%rsi), %xmm1
	movaps	32(%rsi), %xmm2
	movaps	48(%rsi), %xmm3
	movaps	64(%rsi), %xmm4
	movaps	80(%rsi), %xmm5
	movntps	%xmm1, 16(%rdi)
	movntps	%xmm2, 32(%rdi)
	movntps	%xmm3, 48(%rdi)
	movntps	%xmm4, 64(%rdi)
	movntps	%xmm5, 80(%rdi)
	addq	$80, %rdi
	addq	$80, %rsi
	cmpq	%rdi, %rcx
	ja	L(large_loop_fwd_0x0)
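
	/* Note on the large-copy path: it is only entered when the copy
	   is larger than half of the shared cache (see the
	   __x86_shared_cache_size_half check above), so the loops use
	   non-temporal `movntps` stores to avoid polluting the cache.
	   Those stores are weakly ordered; the `sfence` in the shared
	   tail below orders them before the function returns.  */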
	/* Ensure no icache line split on tail.  */
	.p2align 4
L(end_large_loop_fwd):
	sfence
	movups	%xmm11, 16(%rcx)
	movups	%xmm10, 32(%rcx)
	movups	%xmm9, 48(%rcx)
	movups	%xmm8, 64(%rcx)
	movups	%xmm7, 80(%rcx)
	ret


	/* Each loop body is larger than 64 bytes but at most 96 bytes;
	   the 32-byte alignment between them therefore gives exactly
	   96-byte spacing between entries.  */
# define ALIGNED_LARGE_LOOP_FWD(align_by);	\
	.p2align 5;	\
L(large_loop_fwd_ ## align_by):	\
	movaps	16(%rsi), %xmm0;	\
	movaps	32(%rsi), %xmm2;	\
	movaps	48(%rsi), %xmm3;	\
	movaps	64(%rsi), %xmm4;	\
	movaps	80(%rsi), %xmm5;	\
	movaps	%xmm5, %xmm6;	\
	palignr	$align_by, %xmm4, %xmm5;	\
	palignr	$align_by, %xmm3, %xmm4;	\
	palignr	$align_by, %xmm2, %xmm3;	\
	palignr	$align_by, %xmm0, %xmm2;	\
	palignr	$align_by, %xmm1, %xmm0;	\
	movaps	%xmm6, %xmm1;	\
	movntps	%xmm0, 16(%rdi);	\
	movntps	%xmm2, 32(%rdi);	\
	movntps	%xmm3, 48(%rdi);	\
	movntps	%xmm4, 64(%rdi);	\
	movntps	%xmm5, 80(%rdi);	\
	addq	$80, %rdi;	\
	addq	$80, %rsi;	\
	cmpq	%rdi, %rcx;	\
	ja	L(large_loop_fwd_ ## align_by);	\
	jmp	L(end_large_loop_fwd);

	/* Must be in descending order.  */
	ALIGNED_LARGE_LOOP_FWD (0xf)
	ALIGNED_LARGE_LOOP_FWD (0xe)
	ALIGNED_LARGE_LOOP_FWD (0xd)
	ALIGNED_LARGE_LOOP_FWD (0xc)
	ALIGNED_LARGE_LOOP_FWD (0xb)
	ALIGNED_LARGE_LOOP_FWD (0xa)
	ALIGNED_LARGE_LOOP_FWD (0x9)
	ALIGNED_LARGE_LOOP_FWD (0x8)
	ALIGNED_LARGE_LOOP_FWD (0x7)
	ALIGNED_LARGE_LOOP_FWD (0x6)
	ALIGNED_LARGE_LOOP_FWD (0x5)
	ALIGNED_LARGE_LOOP_FWD (0x4)
	ALIGNED_LARGE_LOOP_FWD (0x3)
	ALIGNED_LARGE_LOOP_FWD (0x2)
	ALIGNED_LARGE_LOOP_FWD (0x1)


	.p2align 6
L(loop_bkwd_start):
L(loop_bkwd_0x0):
	movaps	32(%rsi), %xmm1
	movaps	16(%rsi), %xmm2
	movaps	0(%rsi), %xmm3
	movaps	%xmm1, 32(%rdi)
	movaps	%xmm2, 16(%rdi)
	movaps	%xmm3, 0(%rdi)
	subq	$48, %rdi
	subq	$48, %rsi
	cmpq	%rdi, %r8
	jb	L(loop_bkwd_0x0)
L(end_loop_bkwd):
	movups	%xmm7, -16(%r8, %rdx)
	movups	%xmm0, 0(%r8)
	movups	%xmm4, 16(%r8)
	movups	%xmm5, 32(%r8)

	ret


	/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
	   60 bytes otherwise.  */
# define ALIGNED_LOOP_BKWD(align_by);	\
	.p2align 6;	\
L(loop_bkwd_ ## align_by):	\
	movaps	32(%rsi), %xmm1;	\
	movaps	16(%rsi), %xmm2;	\
	movaps	0(%rsi), %xmm3;	\
	palignr	$align_by, %xmm1, %xmm6;	\
	palignr	$align_by, %xmm2, %xmm1;	\
	palignr	$align_by, %xmm3, %xmm2;	\
	movaps	%xmm6, 32(%rdi);	\
	movaps	%xmm1, 16(%rdi);	\
	movaps	%xmm2, 0(%rdi);	\
	subq	$48, %rdi;	\
	subq	$48, %rsi;	\
	movaps	%xmm3, %xmm6;	\
	cmpq	%rdi, %r8;	\
	jb	L(loop_bkwd_ ## align_by);	\
	jmp	L(end_loop_bkwd);

	/* Must be in descending order.  */
	ALIGNED_LOOP_BKWD (0xf)
	ALIGNED_LOOP_BKWD (0xe)
	ALIGNED_LOOP_BKWD (0xd)
	ALIGNED_LOOP_BKWD (0xc)
	ALIGNED_LOOP_BKWD (0xb)
	ALIGNED_LOOP_BKWD (0xa)
	ALIGNED_LOOP_BKWD (0x9)
	ALIGNED_LOOP_BKWD (0x8)
	ALIGNED_LOOP_BKWD (0x7)
	ALIGNED_LOOP_BKWD (0x6)
	ALIGNED_LOOP_BKWD (0x5)
	ALIGNED_LOOP_BKWD (0x4)
	ALIGNED_LOOP_BKWD (0x3)
	ALIGNED_LOOP_BKWD (0x2)
	ALIGNED_LOOP_BKWD (0x1)
END(MEMMOVE)

strong_alias (MEMMOVE, MEMCPY)
# if defined SHARED
strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
# endif
#endif