/* Optimized memset for Fujitsu A64FX processor.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sysdeps/aarch64/memset-reg.h>

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */

#define L1_SIZE		(64*1024)	// L1 64KB
#define L2_SIZE		(8*1024*1024)	// L2 8MB
#define CACHE_LINE_SIZE	256
#define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
#define vector_length	x9

#if HAVE_AARCH64_SVE_ASM
# if IS_IN (libc)
#  define MEMSET __memset_a64fx

	.arch armv8.2-a+sve

	.macro st1b_unroll first=0, last=7
	st1b	z0.b, p0, [dst, \first, mul vl]
	.if \last-\first
	st1b_unroll "(\first+1)", \last
	.endif
	.endm


#undef BTI_C
#define BTI_C

ENTRY (MEMSET)
	PTR_ARG (0)
	SIZE_ARG (2)

	cntb	vector_length
	dup	z0.b, valw
	whilelo	p0.b, vector_length, count
	b.last	1f
	whilelo	p1.b, xzr, count
	st1b	z0.b, p1, [dstin, 0, mul vl]
	st1b	z0.b, p0, [dstin, 1, mul vl]
	ret

	// count >= vector_length * 2
1:	cmp	count, vector_length, lsl 2
	add	dstend, dstin, count
	b.hi	1f
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z0.b, p0, [dstin, 1, mul vl]
	st1b	z0.b, p0, [dstend, -2, mul vl]
	st1b	z0.b, p0, [dstend, -1, mul vl]
	ret

	// count > vector_length * 4
1:	lsl	tmp1, vector_length, 3
	cmp	count, tmp1
	b.hi	L(vl_agnostic)
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z0.b, p0, [dstin, 1, mul vl]
	st1b	z0.b, p0, [dstin, 2, mul vl]
	st1b	z0.b, p0, [dstin, 3, mul vl]
	st1b	z0.b, p0, [dstend, -4, mul vl]
	st1b	z0.b, p0, [dstend, -3, mul vl]
	st1b	z0.b, p0, [dstend, -2, mul vl]
	st1b	z0.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
L(vl_agnostic): // VL Agnostic
	mov	dst, dstin
	cmp	count, L1_SIZE
	b.hi	L(L1_prefetch)

	// count >= 8 * vector_length
L(unroll8):
	sub	count, count, tmp1
	.p2align 4
	// The two instructions at the head of the following loop, the
	// cmp and the branch, are a workaround to avoid a drop from
	// peak performance at the 16KB size.
	// They were found heuristically, and the branch condition,
	// b.ne, is chosen deliberately so the branch is never taken.
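	// (Comparing xzr with itself always sets the Z flag, so the
	// b.ne below statically falls through; the pair serves only
	// to pad the loop's fetch/issue alignment.)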
1:	cmp	xzr, xzr
	b.ne	1b
	st1b_unroll 0, 7
	add	dst, dst, tmp1
	subs	count, count, tmp1
	b.hi	1b
	add	count, count, tmp1

L(last):
	// Write the last 2, 5, or 8 vectors, addressed back from dstend.
	cmp	count, vector_length, lsl 1
	b.ls	2f
	add	tmp2, vector_length, vector_length, lsl 2
	cmp	count, tmp2
	b.ls	5f
	st1b	z0.b, p0, [dstend, -8, mul vl]
	st1b	z0.b, p0, [dstend, -7, mul vl]
	st1b	z0.b, p0, [dstend, -6, mul vl]
5:	st1b	z0.b, p0, [dstend, -5, mul vl]
	st1b	z0.b, p0, [dstend, -4, mul vl]
	st1b	z0.b, p0, [dstend, -3, mul vl]
2:	st1b	z0.b, p0, [dstend, -2, mul vl]
	st1b	z0.b, p0, [dstend, -1, mul vl]
	ret

	// count >= L1_SIZE
	.p2align 3
L(L1_prefetch):
	cmp	count, L2_SIZE
	b.hs	L(L2)
	// The loop below writes two 256-byte cache lines per iteration,
	// which requires a vector length of exactly 64 bytes.
	cmp	vector_length, 64
	b.ne	L(unroll8)
1:	st1b_unroll 0, 3
	prfm	pstl1keep, [dst, PF_DIST_L1]
	st1b_unroll 4, 7
	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
	add	dst, dst, CACHE_LINE_SIZE * 2
	sub	count, count, CACHE_LINE_SIZE * 2
	cmp	count, PF_DIST_L1
	b.hs	1b
	b	L(unroll8)

	// count >= L2_SIZE
	.p2align 3
L(L2):
	// DC ZVA can only write zeros, so a non-zero fill byte must
	// take the store loop instead.
	tst	valw, 255
	b.ne	L(unroll8)
	// align dst to CACHE_LINE_SIZE byte boundary
	and	tmp2, dst, CACHE_LINE_SIZE - 1
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z0.b, p0, [dst, 1, mul vl]
	st1b	z0.b, p0, [dst, 2, mul vl]
	st1b	z0.b, p0, [dst, 3, mul vl]
	sub	dst, dst, tmp2
	add	count, count, tmp2

	// clear cachelines using DC ZVA
	sub	count, count, CACHE_LINE_SIZE * 2
	.p2align 4
1:	add	dst, dst, CACHE_LINE_SIZE
	dc	zva, dst
	subs	count, count, CACHE_LINE_SIZE
	b.hi	1b
	add	count, count, CACHE_LINE_SIZE
	b	L(last)

END (MEMSET)
libc_hidden_builtin_def (MEMSET)

# endif /* IS_IN (libc) */
#endif /* HAVE_AARCH64_SVE_ASM */
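/* Usage sketch, not part of the build.  This assumes glibc's IFUNC
   resolver (sysdeps/aarch64/multiarch/memset.c) selects __memset_a64fx
   on A64FX cores, so plain memset calls reach the code above:

       #include <string.h>
       #include <assert.h>

       static char buf[16 * 1024 * 1024];   // >= L2_SIZE

       int
       main (void)
       {
	 memset (buf, 0, sizeof buf);   // zero fill >= L2_SIZE: DC ZVA path
	 memset (buf, 'x', 100);        // tiny fill: predicated SVE stores
	 assert (buf[0] == 'x' && buf[99] == 'x' && buf[100] == 0);
	 return 0;
       }
*/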