/* memset with unaligned store and rep stosb
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping store to avoid branch.
   2. If size is less than VEC, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0	xmm0
#endif

#ifndef YMM0
# define YMM0	ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
#  define VZEROUPPER_SHORT_RETURN	vzeroupper; ret
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# define VZEROUPPER_SHORT_RETURN	rep; ret
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ	vmovq
#  define MOVD	vmovd
# else
#  define MOVQ	movq
#  define MOVD	movd
# endif
#endif

#if VEC_SIZE == 64
# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
#else
# define LOOP_4X_OFFSET	(0)
#endif

#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
# define END_REG	rcx
# define LOOP_REG	rdi
# define LESS_VEC_REG	rax
#else
# define END_REG	rdi
# define LOOP_REG	rdx
# define LESS_VEC_REG	rdi
#endif

#ifdef USE_XMM_LESS_VEC
# define XMM_SMALL	1
#else
# define XMM_SMALL	0
#endif

#ifdef USE_LESS_VEC_MASK_STORE
# define SET_REG64	rcx
# define SET_REG32	ecx
# define SET_REG16	cx
# define SET_REG8	cl
#else
# define SET_REG64	rsi
# define SET_REG32	esi
# define SET_REG16	si
# define SET_REG8	sil
#endif

#define PAGE_SIZE 4096

/* Macro to calculate size of small memset block for aligning
   purposes.  */
#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)


#ifndef SECTION
# error SECTION is not defined!
#endif

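/* Illustrative C sketch of the "overlapping store" idea (point 1
   above): for any size n with VEC_SIZE <= n <= 2 * VEC_SIZE, one store
   at the start and one at the end of the buffer cover every byte
   without branching on the exact size.  This is a sketch only, not
   part of the build; store_vec and vec_t are stand-ins for VMOVU and a
   VEC register.

	static void
	set_vec_to_2x_vec (char *dst, size_t n, vec_t v0)
	{
	  store_vec (dst, v0);			// Bytes [0, VEC_SIZE).
	  store_vec (dst + n - VEC_SIZE, v0);	// Bytes [n - VEC_SIZE, n).
	}
   */
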
	.section SECTION(.text), "ax", @progbits
#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	shl	$2, %RDX_LP
	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
	WMEMSET_VDUP_TO_VEC0_LOW()
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec_from_wmemset)
	WMEMSET_VDUP_TO_VEC0_HIGH()
	jmp	L(entry_from_wmemset)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	MEMSET_VDUP_TO_VEC0_HIGH()
L(entry_from_wmemset):
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER_RETURN
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	MEMSET_VDUP_TO_VEC0_HIGH ()
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif

	.p2align 4,, 4
L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
#else
	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
#endif
	VZEROUPPER_RETURN

	/* If we have AVX512 mask instructions, put L(less_vec) close to
	   the entry, as it doesn't take much space and is likely a hot
	   target.  */
#ifdef USE_LESS_VEC_MASK_STORE
	.p2align 4,, 10
L(less_vec):
L(less_vec_from_wmemset):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
	/* Clear the high bits of edi, keeping only the bits relevant to
	   the page-cross check.  Note that we use rax, which is set in
	   MEMSET_SET_VEC0_AND_SET_RETURN, as the pointer from here on
	   out.  */
	andl	$(PAGE_SIZE - 1), %edi
	/* Check if a VEC_SIZE store would cross a page.  Mask stores
	   suffer serious performance degradation when they have to
	   suppress a fault.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
	/* This is generally considered a cold target.  */
	ja	L(cross_page)
# if VEC_SIZE > 32
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
	kmovq	%rcx, %k1
# else
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k1
# endif
	vmovdqu8 %VEC(0), (%rax){%k1}
	VZEROUPPER_RETURN
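
	/* Rough intrinsics equivalent of the masked store above for
	   VEC_SIZE == 64 (illustrative only, not part of the build; the
	   VEC_SIZE <= 32 configurations use a 32-bit mask instead):

		__mmask64 k = _bzhi_u64 (-1ULL, n);	// Low n bits set.
		_mm512_mask_storeu_epi8 (dst, k, v0);

	   Architecturally only the selected bytes are written, but if
	   the full-width access crosses into an inaccessible page the
	   fault has to be suppressed, which is slow; hence the
	   page-cross check above.  */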

# if defined USE_MULTIARCH && IS_IN (libc)
	/* Include L(stosb_local) here if including L(less_vec) between
	   L(stosb_more_2x_vec) and ENTRY.  This is to cache align the
	   L(stosb_more_2x_vec) target.  */
	.p2align 4,, 10
L(stosb_local):
	movzbl	%sil, %eax
	mov	%RDX_LP, %RCX_LP
	mov	%RDI_LP, %RDX_LP
	rep	stosb
	mov	%RDX_LP, %RAX_LP
	VZEROUPPER_RETURN
# endif
#endif

#if defined USE_MULTIARCH && IS_IN (libc)
	.p2align 4
L(stosb_more_2x_vec):
	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
	ja	L(stosb_local)
#endif
	/* Fallthrough goes to the 4x VEC loop at L(loop).  Tests for
	   memset sizes in (2x, 4x] and (4x, 8x] jump to their targets.  */
L(more_2x_vec):
	/* Store next 2x vec regardless.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)


	/* Two different methods of setting up pointers / compare.  The
	   two methods are based on the fact that EVEX/AVX512 mov
	   instructions take more bytes than AVX2/SSE2 mov instructions.
	   EVEX/AVX512 machines also have fast LEA_BID (lea with
	   base + index + displacement).  Both setups compute END_REG to
	   avoid a complex address mode.  For EVEX/AVX512 this saves code
	   size and keeps a few targets in one fetch block.  For
	   AVX2/SSE2 this helps prevent AGU bottlenecks.  */
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
	/* If AVX2/SSE2, compute END_REG (rdi) with the ALU.  */
	addq	%rdx, %END_REG
#endif

	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)


#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
	/* If EVEX/AVX512, compute END_REG as the end of the buffer
	   minus (VEC_SIZE * 4 + LOOP_4X_OFFSET) with LEA_BID.  */

	/* END_REG is rcx for EVEX/AVX512.  */
	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
#endif

	/* Store next 2x vec regardless.  */
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)


#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
	/* If LOOP_4X_OFFSET is nonzero, don't readjust LOOP_REG (rdi);
	   just add the extra offset to the addresses in the loop.  Used
	   for AVX512 to save space, as there is no way to encode
	   (VEC_SIZE * 4) in an imm8.  */
# if LOOP_4X_OFFSET == 0
	subq	$-(VEC_SIZE * 4), %LOOP_REG
# endif
	/* Avoid imm32 compare here to save code size.  */
	cmpq	%rdi, %rcx
#else
	addq	$-(VEC_SIZE * 4), %END_REG
	cmpq	$(VEC_SIZE * 8), %rdx
#endif
	jbe	L(last_4x_vec)
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
	/* Set LOOP_REG (rdx).  */
	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
#endif
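	/* The loop below first aligns LOOP_REG down to a 2 * VEC_SIZE
	   boundary, then stores 4 VECs per iteration with VMOVA until
	   LOOP_REG reaches END_REG; the final, possibly overlapping,
	   4 VECs are written by L(last_4x_vec).  Note that
	   `subq $-(VEC_SIZE * 4)' is equivalent to `addq $(VEC_SIZE * 4)'
	   but encodes smaller when VEC_SIZE == 32, since -128 fits in a
	   sign-extended imm8 while +128 does not.  */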
	/* Align dst for loop.  */
	andq	$(VEC_SIZE * -2), %LOOP_REG
	.p2align 4
L(loop):
	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
	subq	$-(VEC_SIZE * 4), %LOOP_REG
	cmpq	%END_REG, %LOOP_REG
	jb	L(loop)
	.p2align 4,, MOV_SIZE
L(last_4x_vec):
	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
L(return):
#if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	ret
#endif

	.p2align 4,, 10
#ifndef USE_LESS_VEC_MASK_STORE
# if defined USE_MULTIARCH && IS_IN (libc)
	/* If USE_LESS_VEC_MASK_STORE is not defined, put L(stosb_local)
	   here.  It will be in range for the 2-byte jump encoding.  */
L(stosb_local):
	movzbl	%sil, %eax
	mov	%RDX_LP, %RCX_LP
	mov	%RDI_LP, %RDX_LP
	rep	stosb
	mov	%RDX_LP, %RAX_LP
	VZEROUPPER_RETURN
# endif
	/* Define L(less_vec) only if not otherwise defined.  */
	.p2align 4
L(less_vec):
	/* Broadcast esi to a partial register (e.g. for VEC_SIZE == 32,
	   broadcast to xmm).  This only does anything for AVX2.  */
	MEMSET_VDUP_TO_VEC0_LOW ()
L(less_vec_from_wmemset):
#endif
L(cross_page):
#if VEC_SIZE > 32
	cmpl	$32, %edx
	jge	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpl	$16, %edx
	jge	L(between_16_31)
#endif
#ifndef USE_XMM_LESS_VEC
	MOVQ	%XMM0, %SET_REG64
#endif
	cmpl	$8, %edx
	jge	L(between_8_15)
	cmpl	$4, %edx
	jge	L(between_4_7)
	cmpl	$1, %edx
	jg	L(between_2_3)
	jl	L(between_0_0)
	movb	%SET_REG8, (%LESS_VEC_REG)
L(between_0_0):
	ret
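
	/* In configurations without USE_XMM_LESS_VEC, the MOVQ above
	   copied the replicated byte pattern from XMM0 into SET_REG64,
	   so the blocks below can use plain integer stores.  Each block
	   handles its whole size range branchlessly with overlapping
	   head and tail stores; e.g. for n in [8, 15] a movq at dst and
	   another at dst + n - 8 write every byte while staying in
	   bounds.  */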

	/* Align small targets only if not doing so would cross a fetch
	   line.  */
#if VEC_SIZE > 32
	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	VMOVU	%YMM0, (%LESS_VEC_REG)
	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
	VZEROUPPER_RETURN
#endif

#if VEC_SIZE >= 32
	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
L(between_16_31):
	/* From 16 to 31.  No branch when size == 16.  */
	VMOVU	%XMM0, (%LESS_VEC_REG)
	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
	ret
#endif

	/* Move size is 3 for SSE2, EVEX, and AVX512.  Move size is 4
	   for AVX2.  */
	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
#ifdef USE_XMM_LESS_VEC
	MOVQ	%XMM0, (%rdi)
	MOVQ	%XMM0, -8(%rdi, %rdx)
#else
	movq	%SET_REG64, (%LESS_VEC_REG)
	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
#endif
	ret

	/* Move size is 2 for SSE2, EVEX, and AVX512.  Move size is 4
	   for AVX2.  */
	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
#ifdef USE_XMM_LESS_VEC
	MOVD	%XMM0, (%rdi)
	MOVD	%XMM0, -4(%rdi, %rdx)
#else
	movl	%SET_REG32, (%LESS_VEC_REG)
	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
#endif
	ret

	/* 4 * XMM_SMALL for the third mov for AVX2.  */
	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
#ifdef USE_XMM_LESS_VEC
	movb	%SET_REG8, (%rdi)
	movb	%SET_REG8, 1(%rdi)
	movb	%SET_REG8, -1(%rdi, %rdx)
#else
	movw	%SET_REG16, (%LESS_VEC_REG)
	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
#endif
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))