/* Optimized version of the standard memcpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000-2022 Free Software Foundation, Inc.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* Return: dest

   Inputs:
	in0:	dest
	in1:	src
	in2:	byte count

   An assembly implementation of the algorithm used by the generic C
   version from glibc.  The case when source and dest are aligned is
   treated separately, for extra performance.

   In this form, memcpy assumes little endian mode.  For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1,
   and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
   shrp instruction.  */
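
/* For orientation, a hedged C sketch of the algorithm described above.
   This is illustrative only -- the names below (sketch_memcpy, word,
   as, lo, hi) are inventions of this comment, not glibc identifiers --
   but the three phases map directly onto .l0, the .both_aligned /
   LOOP(shift) paths, and .copy_bytes in the assembly:

	typedef unsigned long word;		// 8 bytes, like OPSIZ

	void *sketch_memcpy (void *dstv, const void *srcv, unsigned long n)
	{
	  char *d = dstv;
	  const char *s = srcv;
	  // Phase 1: copy single bytes until dest is 8-byte aligned.
	  while (n > 0 && ((unsigned long) d & 7) != 0)
	    { *d++ = *s++; n--; }
	  if (((unsigned long) s & 7) == 0)
	    // Phase 2a: both aligned -- copy whole 8-byte words.
	    for (; n >= 8; n -= 8, d += 8, s += 8)
	      *(word *) d = *(const word *) s;
	  else
	    {
	      // Phase 2b: src misaligned -- read aligned words and merge
	      // adjacent pairs with a double shift (what shrp does below).
	      int sh = 8 * (int) ((unsigned long) s & 7);	// like sh1
	      const word *as = (const word *) ((unsigned long) s & ~7UL);
	      word lo = *as++;				// preload
	      for (; n >= 8; n -= 8, d += 8, s += 8)
		{
		  word hi = *as++;
		  // little-endian merge; big endian would swap the roles
		  *(word *) d = (lo >> sh) | (hi << (64 - sh));
		  lo = hi;
		}
	    }
	  // Phase 3: copy the remaining tail bytes.
	  while (n > 0)
	    { *d++ = *s++; n--; }
	  return dstv;
	}
*/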
#define USE_LFETCH
#define USE_FLP
#include <sysdep.h>
#undef ret

#define LFETCH_DIST	500

#define ALIGN_UNROLL_no	4	// no. of elements
#define ALIGN_UNROLL_sh	2	// (shift amount)

#define MEMLAT	8
#define Nrot	((4*(MEMLAT+2) + 7) & ~7)

#define OP_T_THRES	16
#define OPSIZ		8

#define loopcnt		r14
#define elemcnt		r15
#define saved_pr	r16
#define saved_lc	r17
#define adest		r18
#define dest		r19
#define asrc		r20
#define src		r21
#define len		r22
#define tmp2		r23
#define tmp3		r24
#define tmp4		r25
#define ptable		r26
#define ploop56		r27
#define loopaddr	r28
#define sh1		r29
#define ptr1		r30
#define ptr2		r31

#define movi0		mov

#define p_scr		p6
#define p_xtr		p7
#define p_nxtr		p8
#define p_few		p9

#if defined(USE_FLP)
#define load		ldf8
#define store		stf8
#define tempreg		f6
#define the_r		fr
#define the_s		fs
#define the_t		ft
#define the_q		fq
#define the_w		fw
#define the_x		fx
#define the_y		fy
#define the_z		fz
#elif defined(USE_INT)
#define load		ld8
#define store		st8
#define tempreg		tmp2
#define the_r		r
#define the_s		s
#define the_t		t
#define the_q		q
#define the_w		w
#define the_x		x
#define the_y		y
#define the_z		z
#endif

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
/* Manually force proper loop-alignment.  Note: be sure to
   double-check the code-layout after making any changes to
   this routine! */
# define ALIGN(n)	{ nop 0 }
#else
# define ALIGN(n)	.align n
#endif

#if defined(USE_LFETCH)
#define LOOP(shift)							\
		ALIGN(32);						\
.loop##shift##:								\
	{ .mmb								\
(p[0])		ld8.nt1	r[0] = [asrc], 8 ;				\
(p[0])		lfetch.nt1 [ptr1], 16 ;					\
		nop.b 0 ;						\
	} { .mib							\
(p[MEMLAT+1])	st8 [dest] = tmp3, 8 ;					\
(p[MEMLAT])	shrp	tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;		\
		nop.b 0 ;;						\
	} { .mmb							\
(p[0])		ld8.nt1	s[0] = [asrc], 8 ;				\
(p[0])		lfetch.nt1 [ptr2], 16 ;					\
		nop.b 0 ;						\
	} { .mib							\
(p[MEMLAT+1])	st8 [dest] = tmp4, 8 ;					\
(p[MEMLAT])	shrp	tmp4 = s[MEMLAT], r[MEMLAT], shift ;		\
		br.ctop.sptk.many .loop##shift				\
;; }									\
	{ .mib								\
		br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
	}
#else
#define LOOP(shift)							\
		ALIGN(32);						\
.loop##shift##:								\
	{ .mmb								\
(p[0])		ld8.nt1	r[0] = [asrc], 8 ;				\
		nop.b 0 ;						\
	} { .mib							\
(p[MEMLAT+1])	st8 [dest] = tmp3, 8 ;					\
(p[MEMLAT])	shrp	tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;		\
		nop.b 0 ;;						\
	} { .mmb							\
(p[0])		ld8.nt1	s[0] = [asrc], 8 ;				\
		nop.b 0 ;						\
	} { .mib							\
(p[MEMLAT+1])	st8 [dest] = tmp4, 8 ;					\
(p[MEMLAT])	shrp	tmp4 = s[MEMLAT], r[MEMLAT], shift ;		\
		br.ctop.sptk.many .loop##shift				\
;; }									\
	{ .mib								\
		br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
	}
#endif
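
/* What the shrp in LOOP(shift) computes, as a hedged C model.  On
   IA-64, shrp t = hi, lo, count extracts 64 bits from the 128-bit
   concatenation hi:lo shifted right by count.  The helper name
   shrp_model is an invention of this comment:

	// valid here because shift is always one of 8, 16, ..., 56,
	// so neither shift amount below is 0 or 64 (which would be UB in C)
	static unsigned long shrp_model (unsigned long hi, unsigned long lo,
					 unsigned shift)
	{
	  return (lo >> shift) | (hi << (64 - shift));
	}

   Each loop iteration issues two aligned 8-byte loads into the rotating
   registers r[] and s[], plus two stores of merged words that were
   loaded MEMLAT iterations earlier -- a software-pipelined loop whose
   load-to-use distance hides memory latency.  */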
ENTRY(memcpy)
{ .mmi
	.prologue
	alloc	r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
	.rotr	r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
	.rotp	p[MEMLAT+2]
	.rotf	fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
	mov	ret0 = in0		// ret0 = dest (return value)
	.save	pr, saved_pr
	movi0	saved_pr = pr		// save the predicate registers
} { .mmi
	and	tmp4 = 7, in0		// check if destination is aligned
	mov	dest = in0		// dest
	mov	src = in1		// src
;; }
{ .mii
	cmp.eq	p_scr, p0 = in2, r0	// if (len == 0)
	.save	ar.lc, saved_lc
	movi0	saved_lc = ar.lc	// save the loop counter
	.body
	cmp.ge	p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES
} { .mbb
	mov	len = in2		// len
(p_scr)	br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
(p_few)	br.cond.dpnt.many .copy_bytes	// Branch no. 2: copy byte by byte
;; }
{ .mmi
#if defined(USE_LFETCH)
	lfetch.nt1 [dest]
	lfetch.nt1 [src]
#endif
	shr.u	elemcnt = len, 3	// elemcnt = len / 8
} { .mib
	cmp.eq	p_scr, p0 = tmp4, r0	// is destination aligned?
	sub	loopcnt = 7, tmp4
(p_scr)	br.cond.dptk.many .dest_aligned
;; }
{ .mmi
	ld1	tmp2 = [src], 1
	sub	len = len, loopcnt, 1	// len -= loopcnt + 1
	movi0	ar.lc = loopcnt
} { .mib
	cmp.ne	p_scr, p0 = 0, loopcnt	// avoid loading beyond end-point
;; }

.l0:	// ---------------------------- // L0: Align dest on 8-byte boundary
{ .mmi
	st1	[dest] = tmp2, 1
(p_scr)	ld1	tmp2 = [src], 1
} { .mib
	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
	add	loopcnt = -1, loopcnt
	br.cloop.dptk.few .l0
;; }

.dest_aligned:
{ .mmi
	and	tmp4 = 7, src		// ready for alignment check
	shr.u	elemcnt = len, 3	// elemcnt = len / 8
;; }
{ .mib
	cmp.ne	p_scr, p0 = tmp4, r0	// is source also aligned?
	tbit.nz	p_xtr, p_nxtr = src, 3	// prepare a separate move if src
} { .mib				// is not 16B aligned
	add	ptr2 = LFETCH_DIST, dest // prefetch address
	add	ptr1 = LFETCH_DIST, src
(p_scr)	br.cond.dptk.many .src_not_aligned
;; }

// The optimal case, when dest and src are aligned

.both_aligned:
{ .mmi
	.pred.rel "mutex",p_xtr,p_nxtr
(p_xtr)	cmp.gt	p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
(p_nxtr) cmp.gt	p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
	movi0	pr.rot = 1 << 16	// set rotating predicates
} { .mib
(p_scr)	br.cond.dpnt.many .copy_full_words
;; }

{ .mmi
(p_xtr)	load	tempreg = [src], 8
(p_xtr)	add	elemcnt = -1, elemcnt
	movi0	ar.ec = MEMLAT + 1	// set the epilog counter
;; }
{ .mmi
(p_xtr)	add	len = -8, len
	add	asrc = 16, src		// one bank apart (for USE_INT)
	shr.u	loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
;; }
{ .mmi
	add	loopcnt = -1, loopcnt
(p_xtr)	store	[dest] = tempreg, 8	// copy the "extra" word
	nop.i	0
;; }
{ .mib
	add	adest = 16, dest
	movi0	ar.lc = loopcnt		// set the loop counter
;; }

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
	{ nop 0 }
#else
	.align	32
#endif
#if defined(USE_FLP)
.l1:	// ------------------------------- // L1: Everything a multiple of 8
{ .mmi
#if defined(USE_LFETCH)
(p[0])	lfetch.nt1 [ptr2], 32
#endif
(p[0])	ldfp8	the_r[0], the_q[0] = [src], 16
(p[0])	add	len = -32, len
} { .mmb
(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
;; }
{ .mmi
#if defined(USE_LFETCH)
(p[0])	lfetch.nt1 [ptr1], 32
#endif
(p[0])	ldfp8	the_s[0], the_t[0] = [src], 16
} { .mmb
(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
	br.ctop.dptk.many .l1
;; }
#elif defined(USE_INT)
.l1:	// ------------------------------- // L1: Everything a multiple of 8
{ .mmi
(p[0])	load	the_r[0] = [src], 8
(p[0])	load	the_q[0] = [asrc], 8
(p[0])	add	len = -32, len
} { .mmb
(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
;; }
{ .mmi
(p[0])	load	the_s[0] = [src], 24
(p[0])	load	the_t[0] = [asrc], 24
} { .mmb
(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
#if defined(USE_LFETCH)
;; }
{ .mmb
(p[0])	lfetch.nt1 [ptr2], 32
(p[0])	lfetch.nt1 [ptr1], 32
#endif
	br.ctop.dptk.many .l1
;; }
#endif

.copy_full_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, len
	shr.u	elemcnt = len, 3
(p_scr)	br.cond.dpnt.many .copy_bytes
;; }
{ .mii
	load	tempreg = [src], 8
	add	loopcnt = -1, elemcnt
;; }
{ .mii
	cmp.ne	p_scr, p0 = 0, loopcnt
	mov	ar.lc = loopcnt
;; }

.l2:	// ------------------------------- // L2: Max 4 words copied separately
{ .mmi
	store	[dest] = tempreg, 8
(p_scr)	load	tempreg = [src], 8
	add	len = -8, len
} { .mib
	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
	add	loopcnt = -1, loopcnt
	br.cloop.dptk.few .l2
;; }

.copy_bytes:
{ .mib
	cmp.eq	p_scr, p0 = len, r0	// is len == 0 ?
	add	loopcnt = -1, len	// loopcnt = len - 1
(p_scr)	br.cond.spnt .restore_and_exit
;; }
{ .mii
	ld1	tmp2 = [src], 1
	movi0	ar.lc = loopcnt
	cmp.ne	p_scr, p0 = 0, loopcnt	// avoid load beyond end-point
;; }

.l3:	// ------------------------------- // L3: Final byte move
{ .mmi
	st1	[dest] = tmp2, 1
(p_scr)	ld1	tmp2 = [src], 1
} { .mib
	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
	add	loopcnt = -1, loopcnt
	br.cloop.dptk.few .l3
;; }

.restore_and_exit:
{ .mmi
	movi0	pr = saved_pr, -1	// restore the predicate registers
;; }
{ .mib
	movi0	ar.lc = saved_lc	// restore the loop counter
	br.ret.sptk.many b0
;; }
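
/* The unaligned-source path below dispatches into one of seven
   specialized loops, LOOP(8) ... LOOP(56), through the PC-relative
   offset table at .table.  A hedged C analogue using GCC's computed
   goto extension (illustrative only; `table` mirrors the data8
   entries at the end of this file):

	// table[i] holds &&loop56 - &&loop_(8*i) as a byte offset, so
	// subtracting it from &&loop56 yields the matching loop.
	// Index 0 (src already aligned) never reaches this path.
	long off = table[(unsigned long) src & 7];
	goto *(void *) ((char *) &&loop56 - off);
*/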

.src_not_aligned:
{ .mmi
	cmp.gt	p_scr, p0 = 16, len
	and	sh1 = 7, src		// sh1 = src % 8
	shr.u	loopcnt = len, 4	// element-cnt = len / 16
} { .mib
	add	tmp4 = @ltoff(.table), gp
	add	tmp3 = @ltoff(.loop56), gp
(p_scr)	br.cond.dpnt.many .copy_bytes	// do byte by byte if too few
;; }
{ .mmi
	and	asrc = -8, src		// asrc = src & -8: align src for loop
	add	loopcnt = -1, loopcnt	// loopcnt--
	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
} { .mmi
	ld8	ptable = [tmp4]		// ptable = &table
	ld8	ploop56 = [tmp3]	// ploop56 = &loop56
	and	tmp2 = -16, len		// tmp2 = len & -16
;; }
{ .mmi
	add	tmp3 = ptable, sh1	// tmp3 = &table + sh1
	add	src = src, tmp2		// src += len & (-16)
	movi0	ar.lc = loopcnt		// set LC
;; }
{ .mmi
	ld8	tmp4 = [tmp3]		// tmp4 = loop offset
	sub	len = len, tmp2		// len -= len & (-16)
	movi0	ar.ec = MEMLAT + 2	// one more pass needed
;; }
{ .mmi
	ld8	s[1] = [asrc], 8	// preload
	sub	loopaddr = ploop56, tmp4 // loopaddr = &loop56 - loop offset
	movi0	pr.rot = 1 << 16	// set rotating predicates
;; }
{ .mib
	nop.m	0
	movi0	b6 = loopaddr
	br	b6			// jump to the appropriate loop
;; }

	LOOP(8)
	LOOP(16)
	LOOP(24)
	LOOP(32)
	LOOP(40)
	LOOP(48)
	LOOP(56)
END(memcpy)
libc_hidden_builtin_def (memcpy)

	.rodata
	.align	8
.table:
	data8	0			// dummy entry
	data8	.loop56 - .loop8
	data8	.loop56 - .loop16
	data8	.loop56 - .loop24
	data8	.loop56 - .loop32
	data8	.loop56 - .loop40
	data8	.loop56 - .loop48
	data8	.loop56 - .loop56
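
/* A minimal, hedged C harness for sanity-checking a memcpy build
   against the system implementation.  The symbol name
   memcpy_under_test is an assumption for illustration; alias it to
   this file's memcpy (or any candidate) when linking:

	#include <assert.h>
	#include <string.h>

	extern void *memcpy_under_test (void *, const void *, unsigned long);

	int main (void)
	{
	  char src[256], a[256], b[256];
	  for (int i = 0; i < 256; i++)
	    src[i] = (char) i;
	  // exercise every src/dest alignment pair and a range of lengths,
	  // covering the byte, aligned-word, and shifted-word paths
	  for (int sa = 0; sa < 8; sa++)
	    for (int da = 0; da < 8; da++)
	      for (int n = 0; n < 200; n++)
		{
		  memset (a, 0, sizeof a);
		  memset (b, 0, sizeof b);
		  memcpy (a + da, src + sa, (unsigned long) n);
		  memcpy_under_test (b + da, src + sa, (unsigned long) n);
		  assert (memcmp (a, b, sizeof a) == 0);
		}
	  return 0;
	}
*/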