/* Optimized version of the standard memmove() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000-2022 Free Software Foundation, Inc.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

/* Return: dest

   Inputs:
	in0:	dest
	in1:	src
	in2:	byte count

   The core of the function is the memcpy implementation used in memcpy.S.
   When bytes have to be copied backwards, only the easy case, when
   all arguments are multiples of 8, is optimised.

   In this form, it assumes little endian mode. For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
   or the UM.be bit should be cleared at the beginning and set at the end.

   Overview of the paths below:

	.forward	dest, src and len all multiples of 8: two interleaved
			word streams, 16 bytes per loop iteration (.l0).
	.next		any other forward copy.  Lengths <= OP_T_THRES go
			straight to the byte loop (.cpyfew); otherwise dest
			is brought to an 8-byte boundary (.l1), the bulk is
			copied by words (.l3 if src is then aligned too, one
			of the LOOP(n) shift-and-merge loops if not) and the
			tail of len % 8 bytes is finished by .cpyfew.
	.backward	overlapping blocks with dest > src: word loop (.l5)
			when everything is a multiple of 8, byte loop (.l6)
			otherwise.  */

#include <sysdep.h>
#undef ret

/* Forward copies that are not fully 8-byte aligned and are at most
   OP_T_THRES bytes long are done byte by byte (see .next).  */
#define OP_T_THRES	16
/* Word size in bytes.  In the code below it only appears in comments;
   the instructions use the literal 8 / -8.  */
#define OPSIZ		8

/* Register roles:

	adest		second store pointer of the .l0 loop (dest + 8)
	saved_pr	caller's predicate registers, restored on exit
	saved_lc	caller's ar.lc, restored on exit
	dest, src, len	working copies of in0, in1, in2
	asrc		.l0: second load pointer (src + 8)
			LOOP(n): src rounded down to a multiple of 8
	tmp2..tmp4	scratch; tmp4 holds (dest | src | len) & 7 from the
			entry sequence until the alignment tests are done
	ptable		address of .table
	ploop56		address of .loop56
	loopaddr	entry address of the selected LOOP(n)
	sh1		src % 8, later multiplied by 8
	loopcnt		iteration count, written to ar.lc
	value		data byte/word in flight outside the rotating
			registers  */
#define adest		r15
#define saved_pr	r17
#define saved_lc	r18
#define dest		r19
#define src		r20
#define len		r21
#define asrc		r22
#define tmp2		r23
#define tmp3		r24
#define tmp4		r25
#define ptable		r26
#define ploop56		r27
#define loopaddr	r28
#define sh1		r29
#define loopcnt		r30
#define value		r31

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
# define ALIGN(n)	{ nop 0 }
#else
# define ALIGN(n)	.align n
#endif

/* LOOP(shift): word copy loop for a source that is misaligned by
   shift / 8 bytes while dest is 8-byte aligned.

   Each iteration loads one aligned word from asrc into the rotating
   register r[0].  MEMLAT stages later, shrp combines the two consecutive
   words r[MEMLAT] and r[MEMLAT+1] shifted by `shift' bits into `value',
   which is stored to dest one stage after that.  When the loop is done
   control is transferred to .cpyfew to copy the remaining len % 8 bytes.

   The caller must have set ar.lc, ar.ec and pr.rot and pre-loaded the
   first source word into r[1] (see .dest_aligned).  */
#define LOOP(shift)							\
		ALIGN(32);						\
.loop##shift##:								\
(p[0])		ld8	r[0] = [asrc], 8 ;	/* w1 */		\
(p[MEMLAT+1])	st8	[dest] = value, 8 ;				\
(p[MEMLAT])	shrp	value = r[MEMLAT], r[MEMLAT+1], shift ;	\
		nop.b	0 ;						\
		nop.b	0 ;						\
		br.ctop.sptk .loop##shift ;				\
		br.cond.sptk .cpyfew ; /* deal with the remaining bytes */

/* MEMLAT is the number of software-pipeline stages between a load and
   the use of the loaded value in every rotating loop below.  Nrot is the
   number of rotating registers requested from alloc: the 2*MEMLAT+3
   registers named by .rotr (r[] and q[]) rounded up to a multiple of 8.  */
#define MEMLAT	21
#define Nrot	(((2*MEMLAT+3) + 7) & ~7)

ENTRY(memmove)
	.prologue
	// 3 inputs, Nrot - 3 locals, 0 outputs, Nrot rotating registers
	alloc	r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
	.rotr	r[MEMLAT + 2], q[MEMLAT + 1]
	.rotp	p[MEMLAT + 2]
	mov	ret0 = in0		// return value = dest
	.save pr, saved_pr
	mov	saved_pr = pr		// save the predicate registers
	.save ar.lc, saved_lc
	mov	saved_lc = ar.lc	// save the loop counter
	.body
	or	tmp3 = in0, in1 ;;	// tmp3 = dest | src
	or	tmp3 = tmp3, in2	// tmp3 = dest | src | len
	mov	dest = in0		// dest
	mov	src = in1		// src
	mov	len = in2		// len
	sub	tmp2 = r0, in0		// tmp2 = -dest
	cmp.eq	p6, p0 = in2, r0	// if (len == 0)
(p6)	br.cond.spnt .restore_and_exit;;//	return dest;
	and	tmp4 = 7, tmp3		// tmp4 = (dest | src | len) & 7
	// NOTE(review): cmp.le / cmp.lt below are the signed compare forms
	// (the unsigned ones are cmp.leu / cmp.ltu); this presumably relies
	// on no buffer straddling the sign boundary -- confirm.
	cmp.le	p6, p0 = dest, src	// if dest <= src it's always safe
(p6)	br.cond.spnt .forward		// to copy forward
	add	tmp3 = src, len;;
	cmp.lt	p6, p0 = dest, tmp3	// if dest > src && dest < src + len
(p6)	br.cond.spnt .backward		// we have to copy backward
					// otherwise the blocks do not
					// overlap: fall through to .forward

.forward:
	shr.u	loopcnt = len, 4 ;;	// loopcnt = len / 16
	cmp.ne	p6, p0 = tmp4, r0	// if ((dest | src | len) & 7 != 0)
(p6)	br.cond.sptk .next		//	goto next;

// The optimal case, when dest, src and len are all multiples of 8

	and	tmp3 = 0xf, len		// tmp3 = len % 16 (0 or 8 here)
	mov	pr.rot = 1 << 16	// set rotating predicates
	mov	ar.ec = MEMLAT + 1 ;;	// set the epilog counter
	cmp.ne	p6, p0 = tmp3, r0	// do we have to copy an extra word?
	adds	loopcnt = -1, loopcnt;;	// --loopcnt
(p6)	ld8	value = [src], 8;;
(p6)	st8	[dest] = value, 8	// copy the "odd" word
	mov	ar.lc = loopcnt		// set the loop counter
	cmp.eq	p6, p0 = 8, len
(p6)	br.cond.spnt .restore_and_exit;;// the one-word special case
	adds	adest = 8, dest		// set adest one word ahead of dest
	adds	asrc = 8, src ;;	// set asrc one word ahead of src
	nop.b	0			// get the "golden" alignment for
	nop.b	0			// the next loop
.l0:					// 16 bytes per iteration, 2 streams
(p[0])		ld8	r[0] = [src], 16
(p[0])		ld8	q[0] = [asrc], 16
(p[MEMLAT])	st8	[dest] = r[MEMLAT], 16
(p[MEMLAT])	st8	[adest] = q[MEMLAT], 16
		br.ctop.dptk .l0 ;;

	mov	pr = saved_pr, -1	// restore the predicate registers
	mov	ar.lc = saved_lc	// restore the loop counter
	br.ret.sptk.many b0

// Forward copy where dest, src and len are not all multiples of 8.
.next:
	cmp.ge	p6, p0 = OP_T_THRES, len	// is len <= OP_T_THRES
	and	loopcnt = 7, tmp2	// loopcnt = -dest % 8
(p6)	br.cond.spnt .cpyfew		// copy byte by byte
	;;
	cmp.eq	p6, p0 = loopcnt, r0
(p6)	br.cond.sptk .dest_aligned
	sub	len = len, loopcnt	// len -= -dest % 8
	adds	loopcnt = -1, loopcnt	// --loopcnt
	;;
	mov	ar.lc = loopcnt
.l1:					// copy -dest % 8 bytes
	ld1	value = [src], 1	// value = *src++
	;;
	st1	[dest] = value, 1	// *dest++ = value
	br.cloop.dptk .l1

// dest is now a multiple of 8.  Set up the word loop, then pick either
// the plain word copy (.src_aligned) or the LOOP(n) matching src % 8.
.dest_aligned:
	and	sh1 = 7, src		// sh1 = src % 8
	and	tmp2 = -8, len		// tmp2 = len & -OPSIZ
	and	asrc = -8, src		// asrc = src & -OPSIZ  -- align src
	shr.u	loopcnt = len, 3	// loopcnt = len / 8
	and	len = 7, len;;		// len = len % 8
	adds	loopcnt = -1, loopcnt	// --loopcnt
	addl	tmp4 = @ltoff(.table), gp
	addl	tmp3 = @ltoff(.loop56), gp
	mov	ar.ec = MEMLAT + 1	// set EC
	mov	pr.rot = 1 << 16;;	// set rotating predicates
	mov	ar.lc = loopcnt		// set LC
	cmp.eq	p6, p0 = sh1, r0	// is the src aligned?
(p6)	br.cond.sptk .src_aligned
	// src is misaligned: the LOOP(n) loops read through asrc, so src
	// can be moved past the whole words now; .cpyfew continues from it.
	add	src = src, tmp2		// src += len & -OPSIZ
	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
					// (byte offset into .table)
	ld8	ploop56 = [tmp3]	// ploop56 = &loop56
	ld8	ptable = [tmp4];;	// ptable = &table
	add	tmp3 = ptable, sh1;;	// tmp3 = &table + sh1
	mov	ar.ec = MEMLAT + 1 + 1	// one more pass needed
	ld8	tmp4 = [tmp3];;		// tmp4 = loop offset
	sub	loopaddr = ploop56,tmp4	// loopaddr = &loop56 - loop offset
	ld8	r[1] = [asrc], 8;;	// w0
	mov	b6 = loopaddr;;
	br	b6			// jump to the appropriate loop

	LOOP(8)
	LOOP(16)
	LOOP(24)
	LOOP(32)
	LOOP(40)
	LOOP(48)
	LOOP(56)

// src and dest are both multiples of 8: plain pipelined word copy.
.src_aligned:
.l3:
(p[0])		ld8	r[0] = [src], 8
(p[MEMLAT])	st8	[dest] = r[MEMLAT], 8
		br.ctop.dptk .l3

// Copy the remaining len bytes forward, one byte at a time.
.cpyfew:
	cmp.eq	p6, p0 = len, r0	// is len == 0 ?
	adds	len = -1, len		// --len;
(p6)	br.cond.spnt .restore_and_exit ;;
	mov	ar.lc = len
.l4:
	ld1	value = [src], 1
	;;
	st1	[dest] = value, 1
	br.cloop.dptk .l4 ;;
.restore_and_exit:
	mov	pr = saved_pr, -1	// restore the predicate registers
	mov	ar.lc = saved_lc	// restore the loop counter
	br.ret.sptk.many b0

// In the case of a backward copy, optimise only the case when everything
// is a multiple of 8, otherwise copy byte by byte.  The backward copy is
// used only when the blocks are overlapping and dest > src.

.backward:
	shr.u	loopcnt = len, 3	// loopcnt = len / 8
	add	src = src, len		// src points one byte past the end
	add	dest = dest, len ;;	// dest points one byte past the end
	mov	ar.ec = MEMLAT + 1	// set the epilog counter
	mov	pr.rot = 1 << 16	// set rotating predicates
	adds	loopcnt = -1, loopcnt	// --loopcnt
	cmp.ne	p6, p0 = tmp4, r0	// if ((dest | src | len) & 7 != 0)
(p6)	br.cond.sptk .bytecopy ;;	// copy byte by byte backward
	adds	src = -8, src		// src points to the last word
	adds	dest = -8, dest		// dest points to the last word
	mov	ar.lc = loopcnt;;	// set the loop counter
.l5:					// one word per iteration, descending
(p[0])		ld8	r[0] = [src], -8
(p[MEMLAT])	st8	[dest] = r[MEMLAT], -8
		br.ctop.dptk .l5
		br.cond.sptk .restore_and_exit
.bytecopy:
	adds	src = -1, src		// src points to the last byte
	adds	dest = -1, dest		// dest points to the last byte
	adds	loopcnt = -1, len;;	// loopcnt = len - 1
	mov	ar.lc = loopcnt;;	// set the loop counter
.l6:					// one byte per iteration, descending
(p[0])		ld1	r[0] = [src], -1
(p[MEMLAT])	st1	[dest] = r[MEMLAT], -1
		br.ctop.dptk .l6
		br.cond.sptk .restore_and_exit
END(memmove)

/* Distance from each .loopN back to .loop56, indexed by src % 8.
   .dest_aligned subtracts the selected entry from the address of .loop56
   to obtain the loop to jump to.  Entry 0 is never read: a source that is
   already aligned takes the .src_aligned path instead.  */
	.rodata
	.align 8
.table:
	data8	0			// dummy entry
	data8	.loop56 - .loop8
	data8	.loop56 - .loop16
	data8	.loop56 - .loop24
	data8	.loop56 - .loop32
	data8	.loop56 - .loop40
	data8	.loop56 - .loop48
	data8	.loop56 - .loop56

libc_hidden_builtin_def (memmove)