/* strcat with AVX2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)


# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT	__strcat_avx2
# endif

# define USE_AS_STRCAT

/* Number of bytes in a vector register */
# define VEC_SIZE	32

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* char *strcat (char *dst, const char *src)
   ABI:   SysV AMD64.  In: rdi = dst, rsi = src
          (rdx = maxlen when built as strncat via USE_AS_STRNCAT).
   Out:   rax = dst.
   This file only locates the terminating null of DST with AVX2
   compares against a zeroed %ymm6; the actual copy is performed by
   falling through into the included strcpy-avx2.S with
   rdi = dst + strlen (dst), rcx = src, rax = dst.  */
	.section SECTION(.text),"ax",@progbits
ENTRY (STRCAT)
	mov	%rdi, %r9		/* r9 = dst, preserved as the return value.  */
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8		/* r8 = maxlen for the strncat variant.  */
# endif

	xor	%eax, %eax		/* rax = 0: offset accumulator for strlen(dst).  */
	mov	%edi, %ecx
	and	$((VEC_SIZE * 4) - 1), %ecx	/* ecx = dst offset within a 4*VEC block.  */
	vpxor	%xmm6, %xmm6, %xmm6	/* ymm6 = all-zero reference for null-byte compares.  */
	cmp	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)
	/* dst is not within VEC_SIZE of a 4*VEC boundary: an unaligned
	   VEC-sized load from dst cannot cross into an unmapped page
	   beyond the block, so probe the first VEC bytes directly.  */
	vpcmpeqb (%rdi), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx		/* edx: one bit per byte, set where byte == 0.  */
	test	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax	/* rax = dst rounded down to VEC alignment.  */
	jmp	L(align_vec_size_start)
L(fourth_vector_boundary):
	/* dst is in the last VEC of a 4*VEC block: load the aligned VEC
	   containing dst and mask off the bits for bytes before dst so a
	   stray null in the preceding bytes is not taken.  */
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	vpcmpeqb (%rax), %ymm6, %ymm0
	mov	$-1, %r10d
	sub	%rax, %rcx		/* cl = dst - aligned base (bytes to ignore).  */
	shl	%cl, %r10d		/* r10d = mask with low cl bits cleared.  */
	vpmovmskb %ymm0, %edx
	and	%r10d, %edx
	jnz	L(exit)

	/* Unrolled scan: check vectors at rax + 1..5*VEC, advancing rax
	   by 4*VEC per group, until rax reaches 4*VEC alignment and the
	   main loop below can use aligned 4-vector strides.  */
L(align_vec_size_start):
	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax	/* Advance one unroll group; the vector just
					   tested is now at rax + VEC ("second").  */
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* Step one VEC at a time until rax is 4*VEC aligned, then enter
	   the main loop.  */
	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 5), %rax	/* rax now points at the vector just tested.  */
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$VEC_SIZE, %rax

	/* Main loop: rax is 4*VEC aligned.  Fold four aligned vectors
	   with unsigned byte-min (vpminub); the min is zero iff any of
	   the 128 bytes is zero, so one compare per 4*VEC detects a null
	   anywhere in the block.  */
	.p2align 4
L(align_four_vec_loop):
	vmovaps	(%rax), %ymm4
	vpminub	VEC_SIZE(%rax), %ymm4, %ymm4
	vmovaps	(VEC_SIZE * 2)(%rax), %ymm5
	vpminub	(VEC_SIZE * 3)(%rax), %ymm5, %ymm5
	add	$(VEC_SIZE * 4), %rax
	vpminub	%ymm4, %ymm5, %ymm5
	vpcmpeqb %ymm5, %ymm6, %ymm5
	vpmovmskb %ymm5, %edx
	test	%edx, %edx
	jz	L(align_four_vec_loop)

	/* A null lies in the 4*VEC block just passed; re-test each of
	   its four vectors to locate which one holds it.  */
	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
	sub	$(VEC_SIZE * 5), %rax	/* Rewind so the block spans rax+VEC..rax+4*VEC,
					   matching the exit labels' offsets.  */
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	sub	%rdi, %rax		/* rax = offset of block base from dst.  */
	bsf	%rdx, %rdx		/* rdx = index of first null byte in the vector.  */
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax	/* rax = strlen (dst).  */
	jmp	L(StartStrcpyPart)

	/* Exit paths: convert (vector base, bit index) into
	   rax = strlen (dst).  "second".."fifth" name which vector
	   relative to rax held the null, hence the 1..4*VEC adjustment.  */
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_null_on_first_vector):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fifth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	/* Falls through to L(StartStrcpyPart).  */

	.p2align 4
L(StartStrcpyPart):
	/* Hand off to the included strcpy: rdi = dst + strlen (dst),
	   rcx = src (register expected by strcpy-avx2.S under
	   USE_AS_STRCAT), rax = dst for the final return.  */
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax		/* save result */

# ifdef USE_AS_STRNCAT
	test	%r8, %r8
	jz	L(ExitZero)		/* maxlen == 0: nothing to append.  */
#  define USE_AS_STRNCPY
# endif

# include "strcpy-avx2.S"
#endif