/* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara-2.
   Copyright (C) 2007-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define ASI_BLK_P		0xf0
#define ASI_P			0x80
#define ASI_PNF			0x82

#define FPRS_FEF		0x04

#define VISEntryHalf				\
	rd	%fprs, %o5;			\
	wr	%g0, FPRS_FEF, %fprs

#define VISExitHalf				\
	and	%o5, FPRS_FEF, %o5;		\
	wr	%o5, 0x0, %fprs

#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P

#define LOAD(type,addr,dest)	type [addr], dest
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#define STORE(type,src,addr)	type src, [addr]
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI

#ifndef XCC
#define USE_BPR
#define XCC xcc
#endif
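
/* Helpers for the VIS block-copy loops below: FREG_FROB feeds a
 * window of nine source doublewords through faligndata, producing
 * eight realigned doublewords in %f0-%f14; FREG_MOVE_n slides the
 * leftover high registers back down to %f0 for the next iteration;
 * FREG_LOAD_n fetches the initial partial group of doublewords that
 * precedes the first full 64-byte block.  */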

#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8)	\
	faligndata	%x0, %x1, %f0;			\
	faligndata	%x1, %x2, %f2;			\
	faligndata	%x2, %x3, %f4;			\
	faligndata	%x3, %x4, %f6;			\
	faligndata	%x4, %x5, %f8;			\
	faligndata	%x5, %x6, %f10;			\
	faligndata	%x6, %x7, %f12;			\
	faligndata	%x7, %x8, %f14;

#define FREG_MOVE_1(x0)					\
	fsrc2		%x0, %f0;
#define FREG_MOVE_2(x0, x1)				\
	fsrc2		%x0, %f0;			\
	fsrc2		%x1, %f2;
#define FREG_MOVE_3(x0, x1, x2)				\
	fsrc2		%x0, %f0;			\
	fsrc2		%x1, %f2;			\
	fsrc2		%x2, %f4;
#define FREG_MOVE_4(x0, x1, x2, x3)			\
	fsrc2		%x0, %f0;			\
	fsrc2		%x1, %f2;			\
	fsrc2		%x2, %f4;			\
	fsrc2		%x3, %f6;
#define FREG_MOVE_5(x0, x1, x2, x3, x4)			\
	fsrc2		%x0, %f0;			\
	fsrc2		%x1, %f2;			\
	fsrc2		%x2, %f4;			\
	fsrc2		%x3, %f6;			\
	fsrc2		%x4, %f8;
#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5)		\
	fsrc2		%x0, %f0;			\
	fsrc2		%x1, %f2;			\
	fsrc2		%x2, %f4;			\
	fsrc2		%x3, %f6;			\
	fsrc2		%x4, %f8;			\
	fsrc2		%x5, %f10;
#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6)		\
	fsrc2		%x0, %f0;			\
	fsrc2		%x1, %f2;			\
	fsrc2		%x2, %f4;			\
	fsrc2		%x3, %f6;			\
	fsrc2		%x4, %f8;			\
	fsrc2		%x5, %f10;			\
	fsrc2		%x6, %f12;
#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7)	\
	fsrc2		%x0, %f0;			\
	fsrc2		%x1, %f2;			\
	fsrc2		%x2, %f4;			\
	fsrc2		%x3, %f6;			\
	fsrc2		%x4, %f8;			\
	fsrc2		%x5, %f10;			\
	fsrc2		%x6, %f12;			\
	fsrc2		%x7, %f14;
#define FREG_LOAD_1(base, x0)				\
	LOAD(ldd, base + 0x00, %x0)
#define FREG_LOAD_2(base, x0, x1)			\
	LOAD(ldd, base + 0x00, %x0);			\
	LOAD(ldd, base + 0x08, %x1);
#define FREG_LOAD_3(base, x0, x1, x2)			\
	LOAD(ldd, base + 0x00, %x0);			\
	LOAD(ldd, base + 0x08, %x1);			\
	LOAD(ldd, base + 0x10, %x2);
#define FREG_LOAD_4(base, x0, x1, x2, x3)		\
	LOAD(ldd, base + 0x00, %x0);			\
	LOAD(ldd, base + 0x08, %x1);			\
	LOAD(ldd, base + 0x10, %x2);			\
	LOAD(ldd, base + 0x18, %x3);
#define FREG_LOAD_5(base, x0, x1, x2, x3, x4)		\
	LOAD(ldd, base + 0x00, %x0);			\
	LOAD(ldd, base + 0x08, %x1);			\
	LOAD(ldd, base + 0x10, %x2);			\
	LOAD(ldd, base + 0x18, %x3);			\
	LOAD(ldd, base + 0x20, %x4);
#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5)	\
	LOAD(ldd, base + 0x00, %x0);			\
	LOAD(ldd, base + 0x08, %x1);			\
	LOAD(ldd, base + 0x10, %x2);			\
	LOAD(ldd, base + 0x18, %x3);			\
	LOAD(ldd, base + 0x20, %x4);			\
	LOAD(ldd, base + 0x28, %x5);
#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6)	\
	LOAD(ldd, base + 0x00, %x0);			\
	LOAD(ldd, base + 0x08, %x1);			\
	LOAD(ldd, base + 0x10, %x2);			\
	LOAD(ldd, base + 0x18, %x3);			\
	LOAD(ldd, base + 0x20, %x4);			\
	LOAD(ldd, base + 0x28, %x5);			\
	LOAD(ldd, base + 0x30, %x6);

#if IS_IN (libc)

	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

	.text

ENTRY(__mempcpy_niagara2)
	ba,pt		%XCC, 101f
	 add		%o0, %o2, %g5
END(__mempcpy_niagara2)

	.align		32
ENTRY(__memcpy_niagara2)
100:	/* %o0=dst, %o1=src, %o2=len */
	mov		%o0, %g5
101:
# ifndef USE_BPR
	srl		%o2, 0, %o2
# endif
	cmp		%o2, 0
	be,pn		%XCC, 85f
218:	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 *
	 * However, the cut-off point, performance wise, is around
	 * 4 64-byte blocks.
	 */
	cmp		%o2, (4 * 64)
	blu,pt		%XCC, 75f
	 andcc		%o3, 0x7, %g0

	/* %o0: dst
	 * %o1: src
	 * %o2: len  (known to be >= 128)
	 *
	 * The block copy loops can use %o4, %g2, %g3 as
	 * temporaries while copying the data.  %o5 must
	 * be preserved between VISEntryHalf and VISExitHalf
	 */

	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	LOAD(prefetch, %o1 + 0x080, #one_read)

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	sub		%o2, %o4, %o2
1:	subcc		%o4, 1, %o4
	LOAD(ldub, %o1, %g1)
	STORE(stb, %g1, %o0)
	add		%o1, 1, %o1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

2:
	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
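	/* VISEntryHalf enables the FPU by setting FPRS_FEF in %fprs;
	 * the alignaddr below records the source's byte misalignment
	 * in %gsr, which the faligndata instructions in the block
	 * loops use to produce realigned doublewords.  */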
	VISEntryHalf

	membar		#Sync
	alignaddr	%o1, %g0, %g0

	add		%o1, (64 - 1), %o4
	andn		%o4, (64 - 1), %o4
	andn		%o2, (64 - 1), %g1
	sub		%o2, %g1, %o2

	and		%o1, (64 - 1), %g2
	add		%o1, %g1, %o1
	sub		%o0, %o4, %g3
	brz,pt		%g2, 190f
	 cmp		%g2, 32
	blu,a		5f
	 cmp		%g2, 16
	cmp		%g2, 48
	blu,a		4f
	 cmp		%g2, 40
	cmp		%g2, 56
	blu		170f
	 nop
	ba,a,pt		%xcc, 180f

4:	/* 32 <= low bits < 48 */
	blu		150f
	 nop
	ba,a,pt		%xcc, 160f
5:	/* 0 < low bits < 32 */
	blu,a		6f
	 cmp		%g2, 8
	cmp		%g2, 24
	blu		130f
	 nop
	ba,a,pt		%xcc, 140f
6:	/* 0 < low bits < 16 */
	bgeu		120f
	 nop
	/* fall through for 0 < low bits < 8 */
110:	sub		%o4, 64, %g2
	LOAD_BLK(%g2, %f0)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

120:	sub		%o4, 56, %g2
	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

130:	sub		%o4, 48, %g2
	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

140:	sub		%o4, 40, %g2
	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_5(f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

150:	sub		%o4, 32, %g2
	FREG_LOAD_4(%g2, f0, f2, f4, f6)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_4(f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

160:	sub		%o4, 24, %g2
	FREG_LOAD_3(%g2, f0, f2, f4)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_3(f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

170:	sub		%o4, 16, %g2
	FREG_LOAD_2(%g2, f0, f2)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_2(f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

180:	sub		%o4, 8, %g2
	FREG_LOAD_1(%g2, f0)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_1(f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop
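
	/* Source already 64-byte aligned: no realignment needed, so
	 * copy whole 64-byte blocks with block loads and stores.  */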
190:
1:	STORE_INIT(%g0, %o4 + %g3)
	subcc		%g1, 64, %g1
	LOAD_BLK(%o4, %f0)
	STORE_BLK(%f0, %o4 + %g3)
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)

195:
	add		%o4, %g3, %o0
	membar		#Sync

	VISExitHalf

	/* %o2 contains any final bytes still needed to be copied
	 * over.  If anything is left, we copy it one byte at a time.
	 */
	brz,pt		%o2, 85f
	 sub		%o0, %o1, %o3
	ba,a,pt		%XCC, 90f

	.align		64
75: /* 16 < len <= 64 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:
	andn		%o2, 0xf, %o4
	and		%o2, 0xf, %o2
1:	subcc		%o4, 0x10, %o4
	LOAD(ldx, %o1, %o5)
	add		%o1, 0x08, %o1
	LOAD(ldx, %o1, %g1)
	sub		%o1, 0x08, %o1
	STORE(stx, %o5, %o1 + %o3)
	add		%o1, 0x8, %o1
	STORE(stx, %g1, %o1 + %o3)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	LOAD(ldx, %o1, %o5)
	STORE(stx, %o5, %o1 + %o3)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	LOAD(lduw, %o1, %o5)
	STORE(stw, %o5, %o1 + %o3)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	LOAD(ldub, %o1, %o5)
	STORE(stb, %o5, %o1 + %o3)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	LOAD(ldx, %o1, %g2)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %o4
	sllx		%g2, %g1, %g2
1:	add		%o1, 0x8, %o1
	LOAD(ldx, %o1, %g3)
	subcc		%o4, 0x8, %o4
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	STORE(stx, %o5, %o0)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80: /* 0 < len <= 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	LOAD(lduw, %o1, %g1)
	STORE(stw, %g1, %o1 + %o3)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		%g5, %o0

	.align		32
90:
	subcc		%o2, 1, %o2
	LOAD(ldub, %o1, %g1)
	STORE(stb, %g1, %o1 + %o3)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		%g5, %o0

END(__memcpy_niagara2)

#endif