/* memset optimized with AVX512 for KNL hardware.
   Copyright (C) 2015-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


#include "asm-syntax.h"
#ifndef MEMSET
# define MEMSET __memset_avx512_no_vzeroupper
# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
#endif

	.section .text.avx512,"ax",@progbits
#if defined PIC
ENTRY (MEMSET_CHK)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMSET_CHK)
#endif

ENTRY (MEMSET)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	/* Save the return value in %rax and the end of the buffer in
	   %rsi.  vpshufb with an all-zero shuffle control broadcasts
	   the fill byte to every byte of %xmm0.  */
	vpxor	%xmm0, %xmm0, %xmm0
	vmovd	%esi, %xmm1
	lea	(%rdi, %rdx), %rsi
	mov	%rdi, %rax
	vpshufb	%xmm0, %xmm1, %xmm0
	cmp	$16, %rdx
	jb	L(less_16bytes)
	cmp	$512, %rdx
	vbroadcastss	%xmm0, %zmm2
	ja	L(512bytesormore)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	/* 256 to 512 bytes: overlapping unaligned stores, four from
	   each end of the buffer.  */
	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm2, 0xC0(%rdi)
	vmovups	%zmm2, -0x100(%rsi)
	vmovups	%zmm2, -0xC0(%rsi)
	vmovups	%zmm2, -0x80(%rsi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%rsi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	%ymm2, (%rdi)
	vmovdqu	%ymm2, -0x20(%rsi)
	ret

L(less_32bytes):
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm0, -0x10(%rsi)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	vmovq	%xmm0, (%rdi)
	vmovq	%xmm0, -0x08(%rsi)
	ret

L(less_8bytes):
	vmovd	%xmm0, %ecx
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	%ecx, (%rdi)
	mov	%ecx, -0x04(%rsi)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	%cx, (%rdi)
	mov	%cx, -0x02(%rsi)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

L(512bytesormore):
	/* Sizes above half of the shared cache go through the
	   non-temporal path.  */
	mov	__x86_shared_cache_size_half(%rip), %rcx
	cmp	%rcx, %rdx
	ja	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)

	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm2, 0xC0(%rdi)
	vmovups	%zmm2, 0x100(%rdi)
	vmovups	%zmm2, 0x140(%rdi)
	vmovups	%zmm2, 0x180(%rdi)
	vmovups	%zmm2, 0x1C0(%rdi)
	vmovups	%zmm2, -0x200(%rsi)
	vmovups	%zmm2, -0x1C0(%rsi)
	vmovups	%zmm2, -0x180(%rsi)
	vmovups	%zmm2, -0x140(%rsi)
	vmovups	%zmm2, -0x100(%rsi)
	vmovups	%zmm2, -0xC0(%rsi)
	vmovups	%zmm2, -0x80(%rsi)
	vmovups	%zmm2, -0x40(%rsi)
	ret
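/* Both loops below use the same scheme: write the unaligned head
   from the saved start pointer in %rax, round %rdi up to the loop's
   alignment, run the aligned loop, and finish with unaligned stores
   that end exactly at the saved end pointer in %rsi, overlapping the
   loop's last iteration instead of branching on the remainder.  */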
/* Align on 64 and loop with aligned stores.  */
L(1024bytesormore):
	sub	$0x100, %rsi
	vmovups	%zmm2, (%rax)
	and	$-0x40, %rdi
	add	$0x40, %rdi

L(gobble_256bytes_loop):
	vmovaps	%zmm2, (%rdi)
	vmovaps	%zmm2, 0x40(%rdi)
	vmovaps	%zmm2, 0x80(%rdi)
	vmovaps	%zmm2, 0xC0(%rdi)
	add	$0x100, %rdi
	cmp	%rsi, %rdi
	jb	L(gobble_256bytes_loop)
	vmovups	%zmm2, (%rsi)
	vmovups	%zmm2, 0x40(%rsi)
	vmovups	%zmm2, 0x80(%rsi)
	vmovups	%zmm2, 0xC0(%rsi)
	ret

/* Align on 128 and loop with non-temporal stores.  */
L(preloop_large):
	and	$-0x80, %rdi
	add	$0x80, %rdi
	vmovups	%zmm2, (%rax)
	vmovups	%zmm2, 0x40(%rax)
	sub	$0x200, %rsi

L(gobble_512bytes_nt_loop):
	vmovntdq	%zmm2, (%rdi)
	vmovntdq	%zmm2, 0x40(%rdi)
	vmovntdq	%zmm2, 0x80(%rdi)
	vmovntdq	%zmm2, 0xC0(%rdi)
	vmovntdq	%zmm2, 0x100(%rdi)
	vmovntdq	%zmm2, 0x140(%rdi)
	vmovntdq	%zmm2, 0x180(%rdi)
	vmovntdq	%zmm2, 0x1C0(%rdi)
	add	$0x200, %rdi
	cmp	%rsi, %rdi
	jb	L(gobble_512bytes_nt_loop)
	/* Order the non-temporal stores before the ordinary stores of
	   the tail.  */
	sfence
	vmovups	%zmm2, (%rsi)
	vmovups	%zmm2, 0x40(%rsi)
	vmovups	%zmm2, 0x80(%rsi)
	vmovups	%zmm2, 0xC0(%rsi)
	vmovups	%zmm2, 0x100(%rsi)
	vmovups	%zmm2, 0x140(%rsi)
	vmovups	%zmm2, 0x180(%rsi)
	vmovups	%zmm2, 0x1C0(%rsi)
	ret
END (MEMSET)
#endif