/* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara.
   Copyright (C) 2006-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define ASI_P 0x80
#define ASI_PNF 0x82

#define LOAD(type,addr,dest)	type##a [addr] ASI_P, dest
#define LOAD_TWIN(addr_reg,dest0,dest1) \
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0

#define STORE(type,src,addr)	type src, [addr]
#define STORE_INIT(src,addr)	stxa src, [addr] %asi

#ifndef XCC
#define USE_BPR
#define XCC xcc
#endif

#if IS_IN (libc)

	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

	.text

ENTRY(__mempcpy_niagara1)
	ba,pt		%XCC, 101f
	 add		%o0, %o2, %g5
END(__mempcpy_niagara1)

	.align		32
ENTRY(__memcpy_niagara1)
100:	/* %o0=dst, %o1=src, %o2=len */
	mov		%o0, %g5
101:
# ifndef USE_BPR
	srl		%o2, 0, %o2
# endif
	cmp		%o2, 0
	be,pn		%XCC, 85f
218:	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
	cmp		%o2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

	/* %o0:	dst
	 * %o1:	src
	 * %o2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %o4/%o5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %o1, #one_read)
	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	sub		%o2, %o4, %o2
1:	subcc		%o4, 1, %o4
	LOAD(ldub, %o1, %g1)
	STORE(stb, %g1, %o0)
	add		%o1, 1, %o1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%o1, (16 - 1), %o4
	andn		%o2, (64 - 1), %g1	! block copy loop iterator
	sub		%o2, %g1, %o2		! final sub-block copy bytes
	be,pt		%XCC, 50f
	 cmp		%o4, 8
	be,a,pt		%XCC, 10f
	 sub		%o1, 0x8, %o1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	mov		%g1, %o4
	and		%o1, 0x7, %g1
	sll		%g1, 3, %g1
	mov		64, %o3
	andn		%o1, 0x7, %o1
	LOAD(ldx, %o1, %g2)
	sub		%o3, %g1, %o3
	sllx		%g2, %g1, %g2

#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
	LOAD(ldx, SRC, TMP1); \
	srlx		TMP1, PRE_SHIFT, TMP2; \
	or		TMP2, PRE_VAL, TMP2; \
	STORE_INIT(TMP2, DST); \
	sllx		TMP1, POST_SHIFT, PRE_VAL;
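
	/* What one SWIVEL_ONE_DWORD step computes, as a rough C sketch
	 * (illustration only; not part of the build, and the names
	 * below are invented).  SPARC is big-endian, so the bytes not
	 * yet consumed from the previous doubleword sit in its
	 * high-order bits:
	 *
	 *	unsigned shift = 8 * (src & 7);          // %g1, POST_SHIFT
	 *	uint64_t prev = *aligned_src << shift;   // priming load above
	 *	...
	 *	uint64_t cur = *++aligned_src;           // LOAD(ldx, ...)
	 *	*dst64++ = prev | (cur >> (64 - shift)); // merge + STORE_INIT
	 *	prev = cur << shift;                     // carry the leftover
	 *
	 * with PRE_SHIFT == 64 - shift, so every aligned store merges
	 * the tail of one misaligned source word with the head of the
	 * next.  */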

1:	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
	add		%o1, 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32 - 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
	subcc		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

#undef SWIVEL_ONE_DWORD

	srl		%g1, 3, %g1
	ba,pt		%XCC, 60f
	 add		%o1, %g1, %o1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
	LOAD_TWIN(%o1, %o4, %o5)
1:	add		%o1, 16, %o1
	LOAD_TWIN(%o1, %g2, %g3)
	add		%o1, 16 + 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32, %o1
	STORE_INIT(%o5, %o0 + 0x00)	! initializes cache line
	STORE_INIT(%g2, %o0 + 0x08)
	LOAD_TWIN(%o1, %o4, %o5)
	add		%o1, 16, %o1
	STORE_INIT(%g3, %o0 + 0x10)
	STORE_INIT(%o4, %o0 + 0x18)
	LOAD_TWIN(%o1, %g2, %g3)
	add		%o1, 16, %o1
	STORE_INIT(%o5, %o0 + 0x20)
	STORE_INIT(%g2, %o0 + 0x28)
	LOAD_TWIN(%o1, %o4, %o5)
	STORE_INIT(%g3, %o0 + 0x30)
	STORE_INIT(%o4, %o0 + 0x38)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%o1, 0x8, %o1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
1:	LOAD_TWIN(%o1, %o4, %o5)
	add		%o1, 16, %o1
	LOAD_TWIN(%o1, %g2, %g3)
	add		%o1, 16 + 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32, %o1
	STORE_INIT(%o4, %o0 + 0x00)	! initializes cache line
	STORE_INIT(%o5, %o0 + 0x08)
	LOAD_TWIN(%o1, %o4, %o5)
	add		%o1, 16, %o1
	STORE_INIT(%g2, %o0 + 0x10)
	STORE_INIT(%g3, %o0 + 0x18)
	LOAD_TWIN(%o1, %g2, %g3)
	add		%o1, 16, %o1
	STORE_INIT(%o4, %o0 + 0x20)
	STORE_INIT(%o5, %o0 + 0x28)
	STORE_INIT(%g2, %o0 + 0x30)
	STORE_INIT(%g3, %o0 + 0x38)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0
	/* fall through */

60:
	/* %o2 contains any final bytes still needed to be copied
	 * over.  If anything is left, we copy it one byte at a time.
	 */
	wr		%g0, ASI_PNF, %asi
	brz,pt		%o2, 85f
	 sub		%o0, %o1, %o3
	ba,a,pt		%XCC, 90f
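
	/* The tail paths below keep the displacement (dst - src) in
	 * %o3 and address the destination as [%o1 + %o3], so a single
	 * incrementing source pointer drives both streams.  A rough C
	 * sketch of the byte loop at label 90 (names are illustrative
	 * only; not part of the build):
	 *
	 *	ptrdiff_t off = dst - src;   // sub  %o0, %o1, %o3
	 *	while (len--) {
	 *	    unsigned char c = *src;  // LOAD(ldub, %o1, %g1)
	 *	    *(src + off) = c;        // STORE(stb, %g1, %o1 + %o3)
	 *	    src++;
	 *	}
	 */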

	.align		64
70:	/* 16 < len <= 64 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:
	andn		%o2, 0xf, %o4
	and		%o2, 0xf, %o2
1:	subcc		%o4, 0x10, %o4
	LOAD(ldx, %o1, %o5)
	add		%o1, 0x08, %o1
	LOAD(ldx, %o1, %g1)
	sub		%o1, 0x08, %o1
	STORE(stx, %o5, %o1 + %o3)
	add		%o1, 0x8, %o1
	STORE(stx, %g1, %o1 + %o3)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	LOAD(ldx, %o1, %o5)
	STORE(stx, %o5, %o1 + %o3)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	LOAD(lduw, %o1, %o5)
	STORE(stw, %o5, %o1 + %o3)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%XCC, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	LOAD(ldub, %o1, %o5)
	STORE(stb, %o5, %o1 + %o3)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%XCC, 73b

8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	LOAD(ldx, %o1, %g2)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %o4
	sllx		%g2, %g1, %g2
1:	add		%o1, 0x8, %o1
	LOAD(ldx, %o1, %g3)
	subcc		%o4, 0x8, %o4
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	STORE(stx, %o5, %o0)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%XCC, 90f
	 sub		%o0, %o1, %o3

	.align		64
80:	/* 0 < len <= 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	LOAD(lduw, %o1, %g1)
	STORE(stw, %g1, %o1 + %o3)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		%g5, %o0

	.align		32
90:
	subcc		%o2, 1, %o2
	LOAD(ldub, %o1, %g1)
	STORE(stb, %g1, %o1 + %o3)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		%g5, %o0

END(__memcpy_niagara1)

#endif
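
/* For reference, __mempcpy_niagara1 above differs from
   __memcpy_niagara1 only in the return value it saves in %g5 before
   falling into the shared body at 101.  A rough C sketch of the
   relationship (names are illustrative only; not part of the build):

	void *memcpy_like(void *dst, const void *src, size_t n);
	    // returns dst                    (%g5 = %o0)

	void *mempcpy_like(void *dst, const void *src, size_t n)
	{
	    memcpy_like(dst, src, n);
	    return (char *) dst + n;          // %g5 = %o0 + %o2
	}
 */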