/* wcscpy with SSSE3
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 4 because there are no V3/V4
   implementations, so we need this to build for ISA V3/V4
   builds.  */
#if ISA_SHOULD_BUILD (4)

# ifndef WCSCPY
#  define WCSCPY	__wcscpy_ssse3
# endif

# include <sysdep.h>

	.section .text.ssse3,"ax",@progbits
ENTRY (WCSCPY)

/* Work on copies of the arguments so that %rdi, the destination,
   can be returned unchanged.  */
	mov	%rsi, %rcx
	mov	%rdi, %rdx

/* Is the terminating L'\0' among the first four wide characters?  */
	cmpl	$0, (%rcx)
	jz	L(Exit4)
	cmpl	$0, 4(%rcx)
	jz	L(Exit8)
	cmpl	$0, 8(%rcx)
	jz	L(Exit12)
	cmpl	$0, 12(%rcx)
	jz	L(Exit16)

/* %rsi = first 16-byte aligned block after the start of the source.
   Check it for a null wide character while copying the first 16
   bytes, which are known to be non-zero.  */
	lea	16(%rcx), %rsi
	and	$-16, %rsi

	pxor	%xmm0, %xmm0
	mov	(%rcx), %r9
	mov	%r9, (%rdx)

	pcmpeqd	(%rsi), %xmm0
	mov	8(%rcx), %r9
	mov	%r9, 8(%rdx)

	pmovmskb %xmm0, %rax
	sub	%rcx, %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

/* Advance both pointers so that the destination becomes 16-byte
   aligned, then dispatch on the remaining misalignment of the
   source (0, 4, 8 or 12 bytes).  */
	mov	%rdx, %rax
	addq	$16, %rdx
	and	$-16, %rdx
	sub	%rdx, %rax
	sub	%rax, %rcx
	mov	%rcx, %rax
	and	$0xf, %rax
	mov	$0, %rsi

/* case: rcx_offset == rdx_offset */

	jz	L(Align16Both)

	cmp	$4, %rax
	je	L(Shl4)
	cmp	$8, %rax
	je	L(Shl8)
	jmp	L(Shl12)

/* Source and destination are both 16-byte aligned: copy 16-byte
   blocks, checking each for a null wide character, then set up
   the 64-byte loop.  */
L(Align16Both):
	movaps	(%rcx), %xmm1
	movaps	16(%rcx), %xmm2
	movaps	%xmm1, (%rdx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm3
	movaps	%xmm2, (%rdx, %rsi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm4
	movaps	%xmm3, (%rdx, %rsi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm1
	movaps	%xmm4, (%rdx, %rsi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm2
	movaps	%xmm1, (%rdx, %rsi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm3
	movaps	%xmm2, (%rdx, %rsi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm3, (%rdx, %rsi)
	mov	%rcx, %rax
	lea	16(%rcx, %rsi), %rcx
	and	$-0x40, %rcx
	sub	%rcx, %rax
	sub	%rax, %rdx

	mov	$-0x40, %rsi

/* Main loop: copy 64 bytes per iteration.  pminub folds the four
   vectors so that a single pcmpeqd/pmovmskb detects a possible
   null wide character in any of them.  */
	.p2align 4
L(Aligned64Loop):
	movaps	(%rcx), %xmm2
	movaps	%xmm2, %xmm4
	movaps	16(%rcx), %xmm5
	movaps	32(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	48(%rcx), %xmm7
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	pcmpeqd	%xmm0, %xmm3
	pmovmskb %xmm3, %eax
	addq	$64, %rdx
	addq	$64, %rcx
	testl	%eax, %eax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%rdx)
	movaps	%xmm5, -48(%rdx)
	movaps	%xmm6, -32(%rdx)
	movaps	%xmm7, -16(%rdx)
	jmp	L(Aligned64Loop)

/* This 64-byte block may contain the null: recheck its four vectors
   one by one, storing those that precede the null.  */
L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0

	pmovmskb %xmm0, %eax
	movaps	%xmm4, -64(%rdx)
	addq	$16, %rsi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0

	pmovmskb %xmm0, %eax
	movaps	%xmm5, -48(%rdx)
	addq	$16, %rsi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%rdx)
	pcmpeqd	%xmm7, %xmm0

	pmovmskb %xmm0, %eax
	addq	$16, %rsi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	mov	$-0x40, %rsi
	movaps	%xmm7, -16(%rdx)
	jmp	L(Aligned64Loop)

/* The source is 4 bytes past a 16-byte boundary while the
   destination is aligned: combine pairs of aligned loads with
   palignr.  */
	.p2align 4
L(Shl4):
	movaps	-4(%rcx), %xmm1
	movaps	12(%rcx), %xmm2
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	addq	$28, %rcx
	addq	$16, %rdx

	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	addq	$-12, %rcx
	sub	%rax, %rdx

	movaps	-4(%rcx), %xmm1

/* 64 bytes per iteration, realigning the source with palignr.  */
	.p2align 4
L(Shl4LoopStart):
	movaps	12(%rcx), %xmm2
	movaps	28(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$4, %xmm4, %xmm5
	palignr	$4, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl4Start)

	palignr	$4, %xmm2, %xmm3
	addq	$64, %rcx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	addq	$64, %rdx
	jmp	L(Shl4LoopStart)

/* A null was found: copy the bytes preceding its block and finish
   below.  */
L(Shl4LoopExit):
	movdqu	-4(%rcx), %xmm1
	mov	$12, %rsi
	movdqu	%xmm1, -4(%rdx)
	jmp	L(CopyFrom1To16Bytes)

/* Same as L(Shl4), for a source that is 8 bytes past a 16-byte
   boundary.  */
	.p2align 4
L(Shl8):
	movaps	-8(%rcx), %xmm1
	movaps	8(%rcx), %xmm2
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	addq	$24, %rcx
	addq	$16, %rdx

	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	addq	$-8, %rcx
	sub	%rax, %rdx

	movaps	-8(%rcx), %xmm1

	.p2align 4
L(Shl8LoopStart):
	movaps	8(%rcx), %xmm2
	movaps	24(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	addq	$64, %rcx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	addq	$64, %rdx
	jmp	L(Shl8LoopStart)

L(Shl8LoopExit):
	mov	(%rcx), %r9
	mov	$8, %rsi
	mov	%r9, (%rdx)
	jmp	L(CopyFrom1To16Bytes)

/* Same as L(Shl4), for a source that is 12 bytes past a 16-byte
   boundary.  */
	.p2align 4
L(Shl12):
	movaps	-12(%rcx), %xmm1
	movaps	4(%rcx), %xmm2
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	addq	$20, %rcx
	addq	$16, %rdx

	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	addq	$-4, %rcx
	sub	%rax, %rdx

	movaps	-12(%rcx), %xmm1

	.p2align 4
L(Shl12LoopStart):
	movaps	4(%rcx), %xmm2
	movaps	20(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl12Start)
	palignr	$12, %xmm2, %xmm3
	addq	$64, %rcx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	addq	$64, %rdx
	jmp	L(Shl12LoopStart)

L(Shl12LoopExit):
	mov	(%rcx), %r9d
	mov	$4, %rsi
	mov	%r9d, (%rdx)
	jmp	L(CopyFrom1To16Bytes)

/* A null wide character was found.  %rsi is the offset of the
   16-byte block containing it and %eax holds the pcmpeqd byte mask
   selecting which of its four wide characters is zero.  */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%rsi, %rdx
	add	%rsi, %rcx

	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)

	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	%rdi, %rax
	ret

/* The null is in the upper eight bytes of the block.  */
	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit12)

	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
	mov	%rdi, %rax
	ret

/* Copy the final 4 bytes (the null wide character) and return the
   original destination.  */
	.p2align 4
L(Exit4):
	movl	(%rcx), %eax
	movl	%eax, (%rdx)
	mov	%rdi, %rax
	ret

/* Copy the final two wide characters (8 bytes).  */
	.p2align 4
L(Exit8):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	%rdi, %rax
	ret

/* Copy the final three wide characters (12 bytes).  */
	.p2align 4
L(Exit12):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %eax
	mov	%eax, 8(%rdx)
	mov	%rdi, %rax
	ret

/* Copy the final four wide characters (16 bytes).  */
	.p2align 4
L(Exit16):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
	mov	%rdi, %rax
	ret

END(WCSCPY)
#endif