/* Copy SIZE bytes from SRC to DEST.  For SUN4V M7.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef XCC
# define XCC xcc
#endif
	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

#define FPRS_FEF	0x04

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, in this algorithm we use ASI_STBIMRU_P which marks the
 * cache line as "most recently used" for all but the last cache
 * line.
 */

#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define ASI_ST_BLK_INIT_MRU_P	0xf2

#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define BLOCK_SIZE	64	/* L2 data cache line size */
#define SHORTCOPY	3
#define SHORTCHECK	14
#define SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define SMALL_MAX	255	/* max small copy for word/long aligned */
#define SMALL_UMAX	128	/* max small copy for unaligned case */
#define MED_WMAX	1023	/* max copy for medium word-aligned case */
#define MED_MAX		511	/* max copy for medium longword-aligned case */
#define ST_CHUNK	20	/* ST_CHUNK - block of values for BIS Store */
/* On T4, prefetch 20 is a strong read prefetch into the L1 and L2 data
 * caches and can stall the instruction pipeline if the data is still in
 * memory; prefetch 21 is a strong read prefetch into the L2 data cache
 * only, not the L1.  */
#define ALIGN_PRE	20	/* distance for aligned prefetch loop */

#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text

ENTRY(__memmove_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	cmp	%o1, %o0	/* if src >= dst, use forward copy */
	bgeu,pn	%XCC, .Lforcpy	/* else use backward if ...  */
	 sub	%o0, %o1, %o4	/* get difference of two addresses */
	cmp	%o2, %o4	/* compare size and difference of addresses */
	bleu,pn	%XCC, .Lforcpy	/* if size <= diff, no overlap: copy forward */
	 add	%o1, %o2, %o5	/* get to end of source space */
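/* Illustrative C sketch of the dispatch above; dst, src and len are
 * descriptive stand-ins for %o0, %o1 and %o2, not part of the build:
 *
 *	if (src >= dst)				// forward copy is always safe
 *		goto forcpy;
 *	if (len <= (size_t) (dst - src))	// no overlap within len bytes
 *		goto forcpy;
 *	// otherwise dst falls inside [src, src+len): copy backwards,
 *	// walking down from src + len (kept in %o5 above).
 */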

/* an overlapped copy that must be done "backwards" */
.Lchksize:
	cmp	%o2, 8			/* if size < 8, do byte copy */
	blu,pn	%XCC, 2f		/* else continue */

/* Now size is bigger than 8 */
.Ldbalign:
	add	%o0, %o2, %g1		/* get to end of dest space */
	andcc	%g1, 7, %o3		/* %o3 has cnt til dst 8 byte aligned */
	bz,a,pn	%XCC, .Ldbbck		/* skip if dst is 8 byte aligned */
	 andn	%o2, 7, %o3		/* force %o3 cnt to multiple of 8 */
	sub	%o2, %o3, %o2		/* update %o2 with new count */

1:	dec	%o5			/* decrement source */
	ldub	[%o5], %g1		/* load one byte */
	deccc	%o3			/* decrement count */
	bgu,pt	%XCC, 1b		/* if not done keep copying */
	 stb	%g1, [%o5+%o4]		/* store one byte into dest */
	andncc	%o2, 7, %o3		/* force %o3 cnt to multiple of 8 */
	bz,pn	%XCC, 2f		/* if size < 8, move to byte copy */

/* Now destination is 8 byte aligned */
.Ldbbck:
	andcc	%o5, 7, %o0		/* %o0 has src offset */
	bz,a,pn	%XCC, .Ldbcopybc	/* if src is aligned do fast memmove */
	 sub	%o2, %o3, %o2		/* residue bytes in %o2 */

.Lcpy_dbwdbc:				/* alignment of src is needed */
	sub	%o2, 8, %o2		/* set size one loop ahead */
	sll	%o0, 3, %g1		/* %g1 is left shift */
	mov	64, %g5			/* init %g5 to be 64 */
	sub	%g5, %g1, %g5		/* %g5 rightshift = (64 - leftshift) */
	sub	%o5, %o0, %o5		/* align the src at 8 bytes */
	add	%o4, %o0, %o4		/* increase diff between src & dst */
	ldx	[%o5], %o1		/* load first 8 bytes */
	srlx	%o1, %g5, %o1
1:	sub	%o5, 8, %o5		/* subtract 8 from src */
	ldx	[%o5], %o0		/* load 8 bytes */
	sllx	%o0, %g1, %o3		/* shift loaded val left to tmp reg */
	or	%o1, %o3, %o3		/* align data */
	stx	%o3, [%o5+%o4]		/* store 8 bytes */
	subcc	%o2, 8, %o2		/* subtract 8 bytes from size */
	bg,pt	%XCC, 1b		/* if size > 0 continue */
	 srlx	%o0, %g5, %o1		/* move extra bytes for the next use */

	srl	%g1, 3, %o0		/* restore %o0 value for alignment */
	add	%o5, %o0, %o5		/* restore src alignment */
	sub	%o4, %o0, %o4		/* restore diff between src & dest */

	ba	2f			/* branch to the trailing byte copy */
	 add	%o2, 8, %o2		/* restore size value */

.Ldbcopybc:				/* alignment of src is not needed */
1:	sub	%o5, 8, %o5		/* subtract from src */
	ldx	[%o5], %g1		/* load 8 bytes */
	subcc	%o3, 8, %o3		/* subtract from size */
	bgu,pt	%XCC, 1b		/* if size is bigger than 0 continue */
	 stx	%g1, [%o5+%o4]		/* store 8 bytes to destination */

	ba	2f
	 nop

.Lbcbyte:
1:	ldub	[%o5], %g1		/* load one byte */
	stb	%g1, [%o5+%o4]		/* store one byte */
2:	deccc	%o2			/* decrement size */
	bgeu,a,pt %XCC, 1b		/* if size is >= 0 continue */
	 dec	%o5			/* decrement from address */

.Lexitbc:				/* exit from backward copy */
	retl
	 add	%o5, %o4, %o0		/* restore dest addr */


/* Check to see if memmove is a large aligned copy.
 * If so, use a special version of the copy that avoids
 * use of block store init.  */
.Lforcpy:
	cmp	%o2, SMALL_MAX		/* check for not small case */
	blt,pn	%XCC, .Lmv_short	/* merge with memcpy */
	 mov	%o0, %g1		/* save %o0 */
	neg	%o0, %o5
	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned */
	brz,pt	%o5, .Lmv_dst_aligned_on_8
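/* The prologue below is, in effect (illustrative C; dst, src and len
 * are stand-ins for %o0, %o1 and %o2):
 *
 *	size_t head = -(uintptr_t) dst & 7;	// bytes until dst is 8-byte aligned
 *	ptrdiff_t diff = src - dst;		// "%o1 gets the difference"
 *	for (len -= head; head != 0; head--, dst++)
 *		*dst = dst[diff];
 *
 * Keeping src as a difference means only %o0 is incremented inside the
 * loop; the same idiom recurs throughout the word and long loops below.
 */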
	/* %o5 has the bytes to be written in partial store.  */
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference */
7:					/* dst aligning loop */
	ldub	[%o1+%o0], %o4		/* load one byte */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	 add	%o0, 1, %o0		/* advance dst */
	add	%o1, %o0, %o1		/* restore %o1 */
.Lmv_dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pn	%o5, .Lsrc_dst_unaligned_on_8
	 prefetch [%o1 + (1 * BLOCK_SIZE)], 20

.Lmv_src_dst_aligned_on_8:
	/* check if we are copying MED_MAX or more bytes */
	cmp	%o2, MED_MAX		/* limit to store buffer size */
	bleu,pt	%XCC, .Lmedlong
	 prefetch [%o1 + (2 * BLOCK_SIZE)], 20

/* The mv_align loop below mimics the memcpy code for large aligned copies,
 * but does not use the ASI_STBI_P (block initializing store) performance
 * optimization.  This is used when memcpy is incorrectly invoked with
 * overlapping buffers.  */

.Lmv_large_align8_copy:			/* src and dst share 8 byte alignment */
	/* align dst to 64 byte boundary */
	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned */
	brz,pn	%o3, .Lmv_aligned_on_64
	 sub	%o3, 64, %o3		/* %o3 has negative bytes to move */
	add	%o2, %o3, %o2		/* adjust remaining count */
.Lmv_align_to_64:
	ldx	[%o1], %o4
	add	%o1, 8, %o1		/* increment src ptr */
	addcc	%o3, 8, %o3
	stx	%o4, [%o0]
	brnz,pt	%o3, .Lmv_align_to_64
	 add	%o0, 8, %o0		/* increment dst ptr */

.Lmv_aligned_on_64:
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2 */
.Lmv_align_loop:
	ldx	[%o1], %o4
	stx	%o4, [%o0]
	prefetch [%o0 + (10 * BLOCK_SIZE)], 22
	prefetch [%o1 + (10 * BLOCK_SIZE)], 21
	subcc	%o5, 64, %o5
	ldx	[%o1+8], %o4
	stx	%o4, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o4
	stx	%o4, [%o0+24]
	ldx	[%o1+32], %o4
	stx	%o4, [%o0+32]
	ldx	[%o1+40], %o4
	stx	%o4, [%o0+40]
	ldx	[%o1+48], %o4
	add	%o1, 64, %o1
	stx	%o4, [%o0+48]
	add	%o0, 64, %o0
	ldx	[%o1-8], %o4
	bgt,pt	%XCC, .Lmv_align_loop
	 stx	%o4, [%o0-8]

	ba	.Lmedlong
	 nop
END(__memmove_niagara7)

ENTRY(__mempcpy_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	ba,pt	%icc, 101f
	 add	%o0, %o2, %g1		/* save dst + len */
END(__mempcpy_niagara7)

	.align	32
ENTRY(__memcpy_niagara7)
100:	/* %o0=dst, %o1=src, %o2=len */
	mov	%o0, %g1		/* save %o0 */
101:
#ifndef __arch64__
	srl	%o2, 0, %o2
#endif
	cmp	%o2, SMALL_MAX		/* check for not small case */
	bgeu,pn	%XCC, .Lmedium		/* go to larger cases */
.Lmv_short:
	cmp	%o2, SHORTCOPY		/* check for really short case */
	ble,pn	%XCC, .Lsmallfin
	 or	%o0, %o1, %o4		/* prepare alignment check */
	andcc	%o4, 0x3, %o5		/* test for word alignment */
	bnz,pn	%XCC, .Lsmallunalign	/* branch to non-word aligned case */
	 nop
	subcc	%o2, 7, %o2		/* adjust count */
	ble,pn	%XCC, .Lsmallwordx
	 andcc	%o4, 0x7, %o5		/* test for long alignment */
/* 8 or more bytes, src and dest start on word boundary;
 * %o4 contains %o0 | %o1 */
.Lsmalllong:
	bnz,pn	%XCC, .Lsmallwords	/* branch to word aligned case */
	 cmp	%o2, SHORT_LONG-7
	bge,a	%XCC, .Lmedl64		/* if we branch */
	 sub	%o2, 56, %o2		/* adjust %o2 to -63 off count */

/* slightly unroll the small_long_loop to improve very short copies */
	cmp	%o2, 32-7
	blt,a,pn %XCC, .Lsmall_long_l
	 sub	%o1, %o0, %o1		/* %o1 gets the difference */

	ldx	[%o1], %o5
	ldx	[%o1+8], %o4
	ldx	[%o1+16], %o3

	subcc	%o2, 24, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

	stx	%o5, [%o0]		/* write word */
	stx	%o4, [%o0+8]		/* write word */
	stx	%o3, [%o0+16]		/* write word */

	add	%o0, 24, %o0

/* end loop unroll */
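/* The count was biased by -7 up front (subcc %o2, 7, %o2), so the loop
 * below can decrement and test with a single subcc: "bgu" then means
 * "at least 8 bytes still remain".  Roughly (illustrative C):
 *
 *	len -= 7;			// now len > 0 iff 8+ bytes remain
 *	do {
 *		*(uint64_t *) dst = *(const uint64_t *) (dst + diff);
 *		dst += 8;
 *	} while ((len -= 8) > 0);
 *	len += 7;			// restore the true residue (0..7)
 */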
.Lsmall_long_l:
	ldx	[%o1+%o0], %o3
	subcc	%o2, 8, %o2
	add	%o0, 8, %o0
	bgu,pn	%XCC, .Lsmall_long_l	/* loop until done */
	 stx	%o3, [%o0-8]		/* write word */
	addcc	%o2, 7, %o2		/* restore %o2 to correct count */
	bnz,pn	%XCC, .Lsmall_long_x	/* check for completion */
	 add	%o1, %o0, %o1		/* restore %o1 */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0 */
.Lsmall_long_x:
	cmp	%o2, 4			/* check for 4 or more bytes left */
	blt,pn	%XCC, .Lsmallleft3	/* if not, go to finish up */
	 nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	bnz,pn	%XCC, .Lsmallleft3
	 add	%o0, 4, %o0
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align 32
/* src and dest start on word boundary; 7 or fewer bytes */
.Lsmallwordx:
	lduw	[%o1], %o3		/* read word */
	addcc	%o2, 3, %o2		/* restore count */
	bz,pt	%XCC, .Lsmallexit
	 stw	%o3, [%o0]		/* write word */
	deccc	%o2			/* reduce count for cc test */
	ldub	[%o1+4], %o3		/* load one byte */
	bz,pt	%XCC, .Lsmallexit
	 stb	%o3, [%o0+4]		/* store one byte */
	ldub	[%o1+5], %o3		/* load second byte */
	deccc	%o2
	bz,pt	%XCC, .Lsmallexit
	 stb	%o3, [%o0+5]		/* store second byte */
	ldub	[%o1+6], %o3		/* load third byte */
	stb	%o3, [%o0+6]		/* store third byte */
.Lsmallexit:
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align 32
.Lsmallunalign:
	cmp	%o2, SHORTCHECK
	ble,pn	%XCC, .Lsmallrest
	 cmp	%o2, SMALL_UMAX
	bge,pt	%XCC, .Lmedium_join
	 andcc	%o1, 0x3, %o5		/* is src word aligned? */
	bz,pn	%XCC, .Laldst
	 cmp	%o5, 2			/* is src half-word aligned? */
	be,pt	%XCC, .Ls2algn
	 cmp	%o5, 3			/* src is byte aligned */
.Ls1algn:
	ldub	[%o1], %o3		/* move 1 or 3 bytes to align it */
	inc	1, %o1
	stb	%o3, [%o0]		/* move a byte to align src */
	inc	1, %o0
	bne,pt	%XCC, .Ls2algn
	 dec	%o2
	b	.Lald			/* now go align dest */
	 andcc	%o0, 0x3, %o5

.Ls2algn:
	lduh	[%o1], %o3		/* know src is 2 byte aligned */
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%o0]		/* have to do bytes, */
	stb	%o3, [%o0 + 1]		/* do not know dst alignment */
	inc	2, %o0
	dec	2, %o2

.Laldst:
	andcc	%o0, 0x3, %o5		/* align the destination address */
.Lald:
	bz,pn	%XCC, .Lw4cp
	 cmp	%o5, 2
	be,pn	%XCC, .Lw2cp
	 cmp	%o5, 3
.Lw3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%o0]
	bne,pt	%XCC, .Lw1cp
	 inc	%o0
	dec	1, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count */
	dec	4, %o3			/* avoid reading beyond tail of src */
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

1:	sll	%o4, 8, %g5		/* save residual bytes */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		/* merge with residual */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 1b
	 inc	4, %o0
	sub	%o1, 3, %o1		/* used one byte of last word read */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw1cp:	srl	%o4, 8, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	3, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count */
	dec	4, %o3			/* avoid reading beyond tail of src */
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

2:	sll	%o4, 24, %g5		/* save residual bytes */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		/* merge with residual */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 2b
	 inc	4, %o0
	sub	%o1, 1, %o1		/* used 3 bytes of last word read */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2
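/* The .Lw3cp and .Lw1cp loops above and the .Lw2cp loop below all copy
 * from a word-aligned source to a destination at a different byte
 * offset by shifting and merging two successive source words.  For the
 * .Lw3cp case the loop body is roughly (illustrative C; SPARC is
 * big-endian, and prev/next are stand-in names for the value carried
 * in %o4):
 *
 *	uint32_t next = *(const uint32_t *) (dst + diff);
 *	*(uint32_t *) dst = (prev << 8) | (next >> 24);
 *	dst += 4;
 *	prev = next;
 */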

.Lw2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	2, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count */
	dec	4, %o3			/* avoid reading beyond tail of src */
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

3:	sll	%o4, 16, %g5		/* save residual bytes */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		/* merge with residual */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 3b
	 inc	4, %o0
	sub	%o1, 2, %o1		/* used two bytes of last word read */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw4cp:	andn	%o2, 3, %o3		/* %o3 is aligned word count */
	sub	%o1, %o0, %o1		/* %o1 gets the difference */

1:	lduw	[%o1+%o0], %o4		/* read from address */
	deccc	4, %o3			/* decrement count */
	st	%o4, [%o0]		/* write at destination address */
	bgu,pt	%XCC, 1b
	 inc	4, %o0			/* increment to address */
	and	%o2, 3, %o2		/* number of leftover bytes, if any */

	/* simple finish up byte copy, works with any alignment */
7:
	add	%o1, %o0, %o1		/* restore %o1 */
.Lsmallrest:
	tst	%o2
	bz,pt	%XCC, .Lsmallx
	 cmp	%o2, 4
	blt,pn	%XCC, .Lsmallleft3
	 nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	ldub	[%o1], %o3		/* read byte */
	subcc	%o2, 4, %o2		/* reduce count by 4 */
	stb	%o3, [%o0]		/* write byte */
	ldub	[%o1+1], %o3		/* repeat for total of 4 bytes */
	add	%o1, 4, %o1		/* advance SRC by 4 */
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		/* advance DST by 4 */
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%XCC, .Lsmallnotalign4	/* loop til 3 or fewer bytes remain */
	 stb	%o3, [%o0-1]
	addcc	%o2, 3, %o2		/* restore count */
	bz,pt	%XCC, .Lsmallx
.Lsmallleft3:				/* 1, 2, or 3 bytes remain */
	subcc	%o2, 1, %o2
	ldub	[%o1], %o3		/* load one byte */
	bz,pt	%XCC, .Lsmallx
	 stb	%o3, [%o0]		/* store one byte */
	ldub	[%o1+1], %o3		/* load second byte */
	subcc	%o2, 1, %o2
	bz,pt	%XCC, .Lsmallx
	 stb	%o3, [%o0+1]		/* store second byte */
	ldub	[%o1+2], %o3		/* load third byte */
	stb	%o3, [%o0+2]		/* store third byte */
.Lsmallx:
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

.Lsmallfin:
	tst	%o2
	bnz,pn	%XCC, .Lsmallleft3
	 nop
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align 16
.Lsmallwords:
	lduw	[%o1], %o3		/* read word */
	subcc	%o2, 8, %o2		/* update count */
	stw	%o3, [%o0]		/* write word */
	add	%o1, 8, %o1		/* update SRC */
	lduw	[%o1-4], %o3		/* read word */
	add	%o0, 8, %o0		/* update DST */
	bgu,pt	%XCC, .Lsmallwords	/* loop until done */
	 stw	%o3, [%o0-4]		/* write word */
	addcc	%o2, 7, %o2		/* restore count */
	bz,pt	%XCC, .Lsmallexit	/* check for completion */
	 cmp	%o2, 4			/* check for 4 or more bytes left */
	blt,pt	%XCC, .Lsmallleft3	/* if not, go to finish up */
	 nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	bnz,pn	%XCC, .Lsmallleft3
	 stw	%o3, [%o0-4]
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align 16
.Lmedium:
.Lmedium_join:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned */
	brz,pt	%o5, .Ldst_aligned_on_8

	/* %o5 has the bytes to be written in partial store.  */
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference */
7:					/* dst aligning loop */
	ldub	[%o1+%o0], %o4		/* load one byte */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	 add	%o0, 1, %o0		/* advance dst */
	add	%o1, %o0, %o1		/* restore %o1 */
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	 nop

.Lsrc_dst_aligned_on_8:
	/* check if we are copying MED_MAX or more bytes */
	cmp	%o2, MED_MAX		/* limit to store buffer size */
	bgu,pn	%XCC, .Llarge_align8_copy
	 nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc	%o2, 63, %o2		/* adjust length to allow cc test */
	ble,pn	%XCC, .Lmedl63		/* skip big loop if < 64 bytes */
	 nop
.Lmedl64:
	ldx	[%o1], %o4		/* load */
	subcc	%o2, 64, %o2		/* decrement length count */
	stx	%o4, [%o0]		/* and store */
	ldx	[%o1+8], %o3		/* a block of 64 bytes */
	stx	%o3, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o3
	stx	%o3, [%o0+24]
	ldx	[%o1+32], %o4		/* load */
	stx	%o4, [%o0+32]		/* and store */
	ldx	[%o1+40], %o3		/* a block of 64 bytes */
	add	%o1, 64, %o1		/* increase src ptr by 64 */
	stx	%o3, [%o0+40]
	ldx	[%o1-16], %o4
	add	%o0, 64, %o0		/* increase dst ptr by 64 */
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%XCC, .Lmedl64		/* repeat if at least 64 bytes left */
	 stx	%o3, [%o0-8]
.Lmedl63:
	addcc	%o2, 32, %o2		/* adjust remaining count */
	ble,pt	%XCC, .Lmedl31		/* skip if 31 or fewer bytes left */
	 nop
	ldx	[%o1], %o4		/* load */
	sub	%o2, 32, %o2		/* decrement length count */
	stx	%o4, [%o0]		/* and store */
	ldx	[%o1+8], %o3		/* a block of 32 bytes */
	add	%o1, 32, %o1		/* increase src ptr by 32 */
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		/* increase dst ptr by 32 */
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	stx	%o3, [%o0-8]
.Lmedl31:
	addcc	%o2, 16, %o2		/* adjust remaining count */
	ble,pt	%XCC, .Lmedl15		/* skip if 15 or fewer bytes left */
	 nop
	ldx	[%o1], %o4		/* load and store 16 bytes */
	add	%o1, 16, %o1		/* increase src ptr by 16 */
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		/* decrease count by 16 */
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		/* increase dst ptr by 16 */
	stx	%o3, [%o0-8]
.Lmedl15:
	addcc	%o2, 15, %o2		/* restore count */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished */
	 cmp	%o2, 8
	blt,pt	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left */
	 tst	%o2
	ldx	[%o1], %o4		/* load 8 bytes */
	add	%o1, 8, %o1		/* increase src ptr by 8 */
	add	%o0, 8, %o0		/* increase dst ptr by 8 */
	subcc	%o2, 8, %o2		/* decrease count by 8 */
	bnz,pn	%XCC, .Lmedw7
	 stx	%o4, [%o0-8]		/* and store 8 bytes */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align 16
.Lsrc_dst_unaligned_on_8:
	/* DST is 8-byte aligned, src is not */
	andcc	%o1, 0x3, %o5		/* test word alignment */
	bnz,pt	%XCC, .Lunalignsetup	/* branch if not word aligned */
	 nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries.  Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache for medium
 * to short data moves.
 */
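/* Each 8-byte store in the loops below is assembled from two 32-bit
 * loads, roughly (illustrative C; SPARC is big-endian):
 *
 *	uint64_t hi = *(const uint32_t *) src;		// ld  [%o1]
 *	uint64_t lo = *(const uint32_t *) (src + 4);	// ld  [%o1+4]
 *	*(uint64_t *) dst = (hi << 32) | lo;		// sllx, or, stx
 *
 * so every store is a full-width, 8-byte-aligned stx, at the cost of
 * two loads per store.
 */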
	cmp	%o2, MED_WMAX		/* limit to store buffer size */
	bge,pt	%XCC, .Lunalignrejoin	/* otherwise rejoin main loop */
	 nop

	subcc	%o2, 31, %o2		/* adjust length to allow cc test */
					/* for end of loop */
	ble,pt	%XCC, .Lmedw31		/* skip big loop if less than 32 */
.Lmedw32:
	ld	[%o1], %o4		/* move a block of 32 bytes */
	sllx	%o4, 32, %o5
	ld	[%o1+4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	subcc	%o2, 32, %o2		/* decrement length count */
	ld	[%o1+8], %o4
	sllx	%o4, 32, %o5
	ld	[%o1+12], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+8]
	add	%o1, 32, %o1		/* increase src ptr by 32 */
	ld	[%o1-16], %o4
	sllx	%o4, 32, %o5
	ld	[%o1-12], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+16]
	add	%o0, 32, %o0		/* increase dst ptr by 32 */
	ld	[%o1-8], %o4
	sllx	%o4, 32, %o5
	ld	[%o1-4], %o4
	or	%o4, %o5, %o5
	bgu,pt	%XCC, .Lmedw32		/* repeat if at least 32 bytes left */
	 stx	%o5, [%o0-8]
.Lmedw31:
	addcc	%o2, 31, %o2		/* restore count */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished */
	 cmp	%o2, 16
	blt,pt	%XCC, .Lmedw15
	 nop
	ld	[%o1], %o4		/* move a block of 16 bytes */
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		/* decrement length count */
	ld	[%o1+4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	add	%o1, 16, %o1		/* increase src ptr by 16 */
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		/* increase dst ptr by 16 */
	sllx	%o4, 32, %o5
	ld	[%o1-4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0-8]
.Lmedw15:
	bz,pt	%XCC, .Lsmallexit	/* exit if finished */
	 cmp	%o2, 8
	blt,pn	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left */
	 tst	%o2
	ld	[%o1], %o4		/* load 4 bytes */
	subcc	%o2, 8, %o2		/* decrease count by 8 */
	stw	%o4, [%o0]		/* and store 4 bytes */
	add	%o1, 8, %o1		/* increase src ptr by 8 */
	ld	[%o1-4], %o3		/* load 4 bytes */
	add	%o0, 8, %o0		/* increase dst ptr by 8 */
	stw	%o3, [%o0-4]		/* and store 4 bytes */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished */
.Lmedw7:				/* count is >= 1, less than 8 */
	cmp	%o2, 4			/* check for 4 bytes left */
	blt,pn	%XCC, .Lsmallleft3	/* skip if 3 or fewer bytes left */
	 nop
	ld	[%o1], %o4		/* load 4 bytes */
	add	%o1, 4, %o1		/* increase src ptr by 4 */
	add	%o0, 4, %o0		/* increase dst ptr by 4 */
	subcc	%o2, 4, %o2		/* decrease count by 4 */
	bnz,pt	%XCC, .Lsmallleft3
	 stw	%o4, [%o0-4]		/* and store 4 bytes */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align 16
.Llarge_align8_copy:			/* src and dst 8 byte aligned */
	/* align dst to 64 byte boundary */
	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned */
	brz,pn	%o3, .Laligned_to_64
	 andcc	%o0, 8, %o3		/* odd long word to move? */
	brz,pt	%o3, .Laligned_to_16
	 nop
	ldx	[%o1], %o4
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		/* increment src ptr */
	add	%o0, 8, %o0		/* increment dst ptr */
	stx	%o4, [%o0-8]
.Laligned_to_16:
	andcc	%o0, 16, %o3		/* pair of long words to move? */
	brz,pt	%o3, .Laligned_to_32
	 nop
	ldx	[%o1], %o4
	sub	%o2, 16, %o2
	stx	%o4, [%o0]
	add	%o1, 16, %o1		/* increment src ptr */
	ldx	[%o1-8], %o4
	add	%o0, 16, %o0		/* increment dst ptr */
	stx	%o4, [%o0-8]
.Laligned_to_32:
	andcc	%o0, 32, %o3		/* four long words to move? */
	brz,pt	%o3, .Laligned_to_64
	 nop
	ldx	[%o1], %o4
	sub	%o2, 32, %o2
	stx	%o4, [%o0]
	ldx	[%o1+8], %o4
	stx	%o4, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	add	%o1, 32, %o1		/* increment src ptr */
	ldx	[%o1-8], %o4
	add	%o0, 32, %o0		/* increment dst ptr */
	stx	%o4, [%o0-8]
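/* .Llarge_align8_copy above steps dst up to a 64-byte boundary one
 * address bit at a time, copying 8, 16, then 32 bytes as needed.
 * Roughly (illustrative C; copy8/copy16/copy32 are stand-ins for the
 * inline ldx/stx sequences):
 *
 *	if (dst & 8)  { copy8();  dst += 8;  src += 8;  len -= 8;  }
 *	if (dst & 16) { copy16(); dst += 16; src += 16; len -= 16; }
 *	if (dst & 32) { copy32(); dst += 32; src += 32; len -= 32; }
 *	// dst is now 64-byte aligned
 */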
.Laligned_to_64:
/* The following test is included to avoid issues where existing executables
 * incorrectly call memcpy with overlapping src and dest instead of memmove:
 *
 * if ( (src >= dst) and (dst+len > src) ) go to overlap case
 * if ( (src <  dst) and (src+len > dst) ) go to overlap case
 */
	cmp	%o1, %o0
	bge,pt	%XCC, 1f
	 nop
/* src+len > dst? */
	add	%o1, %o2, %o4
	cmp	%o4, %o0
	bgt,pt	%XCC, .Lmv_aligned_on_64
	 nop
	ba	2f
	 nop
1:
/* dst+len > src? */
	add	%o0, %o2, %o4
	cmp	%o4, %o1
	bgt,pt	%XCC, .Lmv_aligned_on_64
	 nop
2:
/* handle non-overlapped copies
 *
 * Use block init store (BIS) instructions to avoid fetching cache
 * lines from memory.  Use ST_CHUNK stores to the first element of each
 * cache line (similar to prefetching) to avoid overfilling the STQ or
 * miss buffers.  This gives existing cache lines time to be moved out
 * of the L1/L2/L3 cache.
 */
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2 */

/* We use ASI_STBIMRU_P for the first store to each cache line,
 * followed by ASI_STBI_P (mark as LRU) for the last store.  That
 * mixed approach reduces the chances the cache line is removed
 * before we finish setting it, while minimizing the effects on
 * other cached values during a large memcpy.
 *
 * Intermediate stores can be normal since the first BIS activates the
 * cache line in the L2 cache.
 *
 * ST_CHUNK batches up initial BIS operations for several cache lines
 * so that multiple requests are not blocked by overflowing the
 * store miss buffer.  Then the matching stores for all those
 * BIS operations are executed.
 */
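/* Shape of the BIS loops below, as an illustrative C sketch.  store_mru
 * and store_bi stand for the STORE_ASI (ASI_STBIMRU_P) and STORE_INIT
 * (ASI_STBI_P) stxa forms; dst8 and src8 are uint64_t views of the
 * buffers, 8 dwords per 64-byte line:
 *
 *	while (blocks >= ST_CHUNK) {
 *		// pass 1: first dword of each line, marked MRU
 *		for (i = 0; i < ST_CHUNK; i++)
 *			store_mru(dst8 + i*8, src8[i*8]);
 *		// pass 2: dwords 1..6 as plain stores, then dword 7
 *		// as a block-init store that marks the line LRU
 *		for (i = 0; i < ST_CHUNK; i++) {
 *			for (j = 1; j < 7; j++)
 *				dst8[i*8 + j] = src8[i*8 + j];
 *			store_bi(dst8 + i*8 + 7, src8[i*8 + 7]);
 *		}
 *		dst8 += ST_CHUNK*8;  src8 += ST_CHUNK*8;
 *		blocks -= ST_CHUNK;
 *	}
 *	// .Lalign_short runs the same two passes, without prefetches,
 *	// over the remaining whole lines.
 */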

.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%XCC, .Lalign_short
	 mov	ST_CHUNK, %o3
	sllx	%o3, 6, %g5		/* ST_CHUNK*64 */

.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 2, %o3
	ldx	[%o1], %o4
	add	%o1, 128, %o1
	EX_ST(STORE_ASI(%o4, %o0))
	add	%o0, 64, %o0
	ldx	[%o1-64], %o4
	EX_ST(STORE_ASI(%o4, %o0))
	add	%o0, 64, %o0
	bgu,pt	%XCC, .Lalign_loop_start
	 prefetch [%o1 + ((ALIGN_PRE-1) * BLOCK_SIZE)], 21

	mov	ST_CHUNK, %o3
	sub	%o1, %g5, %o1		/* reset %o1 */
	sub	%o0, %g5, %o0		/* reset %o0 */

	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment */
.Lalign_loop_rest:
	ldx	[%o1+8], %o4
	add	%o0, 64, %o0
	stx	%o4, [%o0-48]
	subcc	%o3, 1, %o3
	ldx	[%o1+16], %o4
	stx	%o4, [%o0-40]
	sub	%o5, 64, %o5
	ldx	[%o1+24], %o4
	stx	%o4, [%o0-32]
	ldx	[%o1+32], %o4
	stx	%o4, [%o0-24]
	ldx	[%o1+40], %o4
	stx	%o4, [%o0-16]
	ldx	[%o1+48], %o4
	stx	%o4, [%o0-8]
	add	%o1, 64, %o1
	ldx	[%o1-8], %o4
	bgu,pt	%XCC, .Lalign_loop_rest
	 EX_ST(STORE_INIT(%o4, %o0))	/* mark cache line as LRU */

	mov	ST_CHUNK, %o3
	cmp	%o5, ST_CHUNK*64
	bgu,pt	%XCC, .Lalign_loop_start
	 add	%o0, 8, %o0		/* restore %o0 from ASI alignment */

	cmp	%o5, 0
	beq,pt	%XCC, .Lalign_done

/* no prefetches needed in these loops
 * since we are within ALIGN_PRE of the end */
.Lalign_short:
	srl	%o5, 6, %o3
.Lalign_loop_short:
	subcc	%o3, 1, %o3
	ldx	[%o1], %o4
	add	%o1, 64, %o1
	EX_ST(STORE_ASI(%o4, %o0))
	bgu,pt	%XCC, .Lalign_loop_short
	 add	%o0, 64, %o0

	sub	%o1, %o5, %o1		/* reset %o1 */
	sub	%o0, %o5, %o0		/* reset %o0 */

	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment */
.Lalign_short_rest:
	ldx	[%o1+8], %o4
	add	%o0, 64, %o0
	stx	%o4, [%o0-48]
	ldx	[%o1+16], %o4
	subcc	%o5, 64, %o5
	stx	%o4, [%o0-40]
	ldx	[%o1+24], %o4
	stx	%o4, [%o0-32]
	ldx	[%o1+32], %o4
	stx	%o4, [%o0-24]
	ldx	[%o1+40], %o4
	stx	%o4, [%o0-16]
	ldx	[%o1+48], %o4
	stx	%o4, [%o0-8]
	add	%o1, 64, %o1
	ldx	[%o1-8], %o4
	bgu,pt	%XCC, .Lalign_short_rest
	 EX_ST(STORE_INIT(%o4, %o0))	/* mark cache line as LRU */

	add	%o0, 8, %o0		/* restore %o0 from ASI alignment */

.Lalign_done:
	cmp	%o2, 0
	membar	#StoreStore
	bne,pt	%XCC, .Lmedl63
	 subcc	%o2, 63, %o2		/* adjust length to allow cc test */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0 */

	.align 16
	/* Dst is on 8 byte boundary; src is not; remaining cnt > SMALL_MAX */
	/* Since block load/store and BIS are not in use for unaligned data,
	 * no need to align dst on 64 byte cache line boundary */
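/* The unaligned path below reads aligned doublewords around the source
 * and lets VIS extract the misaligned bytes: alignaddr records src&7 in
 * %gsr, then each faligndata concatenates two doublewords and picks out
 * the 8 bytes at that offset.  Per doubleword this is roughly
 * (illustrative C; big-endian, off = src & 7 with 0 < off < 8 on this
 * path, sa = src rounded down to an 8-byte boundary):
 *
 *	uint64_t w0 = sa[0], w1 = sa[1];
 *	*dst8++ = (w0 << (8 * off)) | (w1 >> (64 - 8 * off));
 *	w0 = w1;			// the fsrc2/%f0 carry below
 *
 * Doing this in fp registers lets eight such merges be in flight per
 * 64-byte block.
 */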
.Lunalignsetup:
.Lunalignrejoin:
	rd	%fprs, %g5		/* check for unused fp */
	/* if fprs.fef == 0, set it.
	 * Setting it when already set costs more than checking */
	andcc	%g5, FPRS_FEF, %g5	/* test FEF, fprs.du = fprs.dl = 0 */
	bz,a	%XCC, 1f
	 wr	%g0, FPRS_FEF, %fprs	/* fprs.fef = 1 */
1:
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2 */
	cmp	%o2, 8			/* ensure we do not load beyond */
	bgt,pt	%XCC, .Lunalign_adjust	/* end of source buffer */
	 andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr */
	add	%o2, 64, %o2		/* adjust to leave loop */
	sub	%o5, 64, %o5		/* early if necessary */
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		/* generate %gsr */
	add	%o1, %o5, %o1		/* advance %o1 to after blocks */
	ldd	[%o4], %f0
.Lunalign_loop:
	prefetch [%o0 + (9 * BLOCK_SIZE)], 20
	ldd	[%o4+8], %f2
	faligndata %f0, %f2, %f16
	ldd	[%o4+16], %f4
	subcc	%o5, BLOCK_SIZE, %o5
	std	%f16, [%o0]
	faligndata %f2, %f4, %f18
	ldd	[%o4+24], %f6
	std	%f18, [%o0+8]
	faligndata %f4, %f6, %f20
	ldd	[%o4+32], %f8
	std	%f20, [%o0+16]
	faligndata %f6, %f8, %f22
	ldd	[%o4+40], %f10
	std	%f22, [%o0+24]
	faligndata %f8, %f10, %f24
	ldd	[%o4+48], %f12
	std	%f24, [%o0+32]
	faligndata %f10, %f12, %f26
	ldd	[%o4+56], %f14
	add	%o4, BLOCK_SIZE, %o4
	std	%f26, [%o0+40]
	faligndata %f12, %f14, %f28
	ldd	[%o4], %f0
	std	%f28, [%o0+48]
	faligndata %f14, %f0, %f30
	std	%f30, [%o0+56]
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%XCC, .Lunalign_loop
	 prefetch [%o4 + (11 * BLOCK_SIZE)], 20

	/* Handle trailing bytes, 64 to 127.
	 * Dest long word aligned, src not long word aligned */
	cmp	%o2, 15
	bleu,pt	%XCC, .Lunalign_short

	 andn	%o2, 0x7, %o5		/* %o5 is multiple of 8 */
	and	%o2, 0x7, %o2		/* residue bytes in %o2 */
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		/* do not load past end of src */
	andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr */
	add	%o1, %o5, %o1		/* move %o1 to after multiple of 8 */
	ldd	[%o4], %f0		/* fetch partial word */
.Lunalign_by8:
	ldd	[%o4+8], %f2
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	std	%f16, [%o0]
	fsrc2	%f2, %f0
	bgu,pt	%XCC, .Lunalign_by8
	 add	%o0, 8, %o0

.Lunalign_short:			/* restore fprs state */
	brnz,pt	%g5, .Lsmallrest
	 nop
	ba	.Lsmallrest
	 wr	%g5, %g0, %fprs
END(__memcpy_niagara7)

#endif