/* strcat with SSE2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds. */
#if ISA_SHOULD_BUILD (2)


# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT  __strcat_sse2_unaligned
# endif

# define USE_AS_STRCAT

/* STRCAT (dst, src) -- presumably the standard C strcat contract; when
   an including file defines USE_AS_STRNCAT a third (size) argument is
   also consumed.  AT&T syntax, arguments assumed to arrive in the
   System V AMD64 argument registers.

   This file only contains the first half of the routine: it finds the
   offset of the terminating NUL of dst, sets up registers, and then
   textually includes "strcpy-sse2-unaligned.S", which supplies the rest
   of the function body (NOTE(review): the copy loop, L(ExitZero) and
   the matching END are expected to live there -- not visible here).

   Register roles inside this file:
     rdi   in:  dst.  Not modified during the length scan; rewritten at
		L(StartStrcpyPart) to dst + length (address of the NUL).
     rsi   in:  src.  Copied to rcx at L(StartStrcpyPart).
     rdx   in:  size argument (USE_AS_STRNCAT only, saved to r8 first);
		afterwards scratch for the pmovmskb NUL bit mask.
     r8	   saved size argument (USE_AS_STRNCAT only).
     r9	   saved original dst; moved to rax as the result.
     rax   0, then 16-byte-aligned scan cursor, then length of dst,
	   finally the saved dst pointer.
     rcx   scratch (misalignment / shift count), then src.
     r10d  bit mask used to ignore bytes that precede dst.
     xmm0-xmm3	all-zero comparands; hold pcmpeqb results.
     xmm4, xmm5	temporaries of the 64-byte loop.  */

.text
ENTRY (STRCAT)
	mov	%rdi, %r9		/* r9 = original dst, kept for the result.  */
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8		/* r8 = size argument; rdx is reused below.  */
# endif

/* Inline corresponding strlen file, temporary until new strcpy
   implementation gets merged.  */

	xor	%rax, %rax		/* rax = 0: base for L(exit_less16).  */
	mov	%edi, %ecx
	and	$0x3f, %ecx		/* ecx = offset of dst inside its 64-byte block.  */
	pxor	%xmm0, %xmm0
	/* Offset > 48 means the 16 bytes starting at dst would extend past
	   the end of that 64-byte block, so use the aligned path instead of
	   an unaligned load.  */
	cmp	$0x30, %ecx
	ja	L(next)
	movdqu	(%rdi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff in every byte lane that is NUL.  */
	pmovmskb %xmm0, %edx		/* edx bit i set <=> dst[i] == 0, i in 0..15.  */
	test	%edx, %edx
	jnz	L(exit_less16)		/* rax is still 0, so the result is just the bit index.  */
	mov	%rdi, %rax
	and	$-16, %rax		/* rax = dst rounded down to 16 bytes.  */
	jmp	L(align16_start)
L(next):
	mov	%rdi, %rax
	and	$-16, %rax		/* rax = dst rounded down to 16 bytes.  */
	pcmpeqb	(%rax), %xmm0		/* Aligned compare; also covers bytes before dst.  */
	mov	$-1, %r10d
	/* Only cl is used by the shift, and a 32-bit SHL takes the count
	   mod 32.  (dst & 0x3f) - (dst & -16) is congruent to dst & 15
	   mod 32, i.e. the number of bytes in the aligned block that
	   precede dst.  */
	sub	%rax, %rcx
	shl	%cl, %r10d		/* r10d = mask with the bits of those bytes cleared.  */
	pmovmskb %xmm0, %edx
	and	%r10d, %edx		/* Drop NUL hits located before dst.  */
	jnz	L(exit)

/* From here on rax is 16-byte aligned and everything up to 15(%rax) has
   been checked.  Sixteen further 16-byte blocks are tested one by one.
   A pcmpeqb that finds no NUL leaves its destination all-zero, which is
   why xmm0-xmm3 are cleared only once and then reused.  The L(exitNN)
   targets add NN to the cursor to locate the block that matched.  */
L(align16_start):
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	pcmpeqb	16(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Second group of four.  The cursor is advanced by 64 right after
	   the compare is issued, so the block at old 80(%rax) is 16(%rax)
	   by the time L(exit16) runs.  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Third group of four (same pattern).  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Fourth group of four (same pattern).  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* No NUL up to 79(%rax).  Bring the cursor to a 64-byte boundary
	   for the main loop, testing one more 16-byte block per step.  In
	   this stage rax is moved onto the block being tested, so a hit
	   goes to plain L(exit) (no extra offset).  When the loop is
	   entered it starts at (%rax), which may re-scan bytes that were
	   already checked; that is harmless.  */
	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	80(%rax), %xmm0
	add	$80, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm1
	add	$16, %rax
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm2
	add	$16, %rax
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm3
	add	$16, %rax
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$16, %rax
	/* Main loop: 64 bytes per iteration from a 64-byte-aligned cursor.
	   The unsigned byte-wise minimum of the four 16-byte blocks has a
	   zero lane iff at least one of the blocks contains a NUL, so one
	   compare against the all-zero xmm0 covers all 64 bytes.  */
	.p2align 4
	L(align64_loop):
	movaps	(%rax), %xmm4
	pminub	16(%rax), %xmm4
	movaps	32(%rax), %xmm5
	pminub	48(%rax), %xmm5
	add	$64, %rax		/* Cursor now points just past the block examined.  */
	pminub	%xmm4, %xmm5
	pcmpeqb	%xmm0, %xmm5
	pmovmskb %xmm5, %edx
	test	%edx, %edx
	jz	L(align64_loop)

	/* A NUL lies in the 64 bytes at -64(%rax)..-1(%rax).  Re-test the
	   four blocks individually.  The cursor is rebased to 16 bytes
	   before that 64-byte block so the shared L(exit16/32/48) offsets
	   apply unchanged.  */
	pcmpeqb	-64(%rax), %xmm0
	sub	$80, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	/* The first three blocks are clean, so the NUL must be in the
	   fourth: no test/branch is needed.  Same arithmetic as
	   L(exit64).  */
	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$64, %rax
	jmp	L(StartStrcpyPart)

/* Exit tails.  On entry rax is the scan cursor and edx the NUL bit mask
   of the 16-byte block located at cursor + NN.  Each tail computes
   rax = (cursor - dst) + index of lowest set bit + NN, i.e. the offset
   of the first NUL from dst.  L(exit_less16) is entered with rax == 0
   and a mask taken directly at dst, so it skips the subtraction.  */
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_less16):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit16):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$16, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit32):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$32, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit48):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$48, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit64):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$64, %rax
	/* Falls through into L(StartStrcpyPart).  */

/* Hand-over to the included copy code.  State established here:
     rdi = dst + length (address of the terminating NUL of dst),
     rcx = src,
     rax = original dst,
     r8  = size argument (USE_AS_STRNCAT only).  */
	.p2align 4
L(StartStrcpyPart):
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax /* save result */

# ifdef USE_AS_STRNCAT
	/* A zero size argument branches to L(ExitZero), which is not
	   defined in this file (NOTE(review): presumably provided by the
	   include below).  */
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

/* The remainder of the function comes from the strcpy source, built
   with USE_AS_STRCAT (and USE_AS_STRNCPY for strncat) defined above.  */
# include "strcpy-sse2-unaligned.S"
#endif