/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing a code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
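/*
 * That is, recreate the o32 layout in which $8..$15 are the temporaries
 * t0..t7; the n64 regdef.h names $8..$11 a4..a7 and only provides
 * t0..t3 (as $12..$15).
 */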
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned. The hardware
	# can fix it faster than we can special case the assembly.
	#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	pref	0, 128(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if there is more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	sltu	t0, len, 128		# See if we can loop more times
	beqz	t0, 1b
	nop
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left. This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b	copy_bytes_checklen
EXC(	STORE	t0, -8(dst),		s_exc_p1u)

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
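/*
 * For example, on a big-endian kernel with USE_DOUBLE the pair for
 * unit 0 below expands to roughly
 *
 *	ldl	t0, 0(src)
 *	ldr	t0, 7(src)
 *
 * which together assemble one possibly-unaligned doubleword into t0;
 * on a little-endian kernel LDFIRST/LDREST are ldr/ldl instead, with
 * the same offsets.
 */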
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	nop
	END(memcpy)

l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 * see (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in dst
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
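	/*
	 * In C terms this is roughly: for (i = 0; i < len; i++) dst[i] = 0;
	 * src is reused as the loop counter so that len, which is the
	 * value reported back to the caller, is left untouched.
	 */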
	beqz	len, done
	SUB	src, len, 1
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	bnez	src, 1b
	SUB	src, src, 1
	jr	ra
	nop


#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	ADD	len, len, n*NBYTES

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	ADD	len, len, 1
s_exc:
	jr	ra
	nop

	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0		# dst + len <= src -> memcpy
	sltu	t1, a0, t1		# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	move	v0, a0			/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)				/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up	# src >= dst
	nop
	ADD	a0, a2			# dst = dst + len
	ADD	a1, a2			# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	SUB	a0, a0, 0x1

r_out:
	jr	ra
	move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	ADD	a0, a0, 0x1

	jr	ra
	move	a2, zero
	END(__rmemcpy)
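
/*
 * Note on the overlap test at the top of memmove: in C terms it is
 * roughly
 *
 *	if (src < dst + len && dst < src + len)
 *		fall through to __rmemcpy (byte copy, backwards when
 *		src < dst);
 *	else
 *		tail-call __memcpy;
 */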