/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#define PREFETCH_AHEAD 6	/* number of SRC cache lines prefetched ahead  */
#define ZERO_AHEAD 4		/* number of DST cache lines zeroed ahead  */

/* memcpy routine optimized for CELL-BE-PPC	v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The below code is loop unrolled for the CELL cache line of 128 bytes
 */

/* void *memcpy (void *dst, const void *src, size_t len)
 *
 * In:   r3 = dst, r4 = src, r5 = len (unsigned byte count)
 * Out:  r3 = dst (r3 is only ever read below, never written)
 *
 * Register roles inside the routine:
 *   r4        source cursor; biased by -8 from .Ldst_aligned to .Ldo_lt16
 *             so the update forms lfdu/stfdu can be used
 *   r5        bytes still to copy
 *   r6        destination cursor (same -8 bias as r4)
 *   r7        scratch: src - dst delta, alignment byte count, loop counts
 *   r8        bytes needed to bring dst to a 16-byte boundary
 *   r10       cache lines to prefetch up front, later the number of
 *             lines copied by .Lloop2 (without further prefetch/dcbz)
 *   r11       cache lines copied by .Lloop, then reused as dcbz offset
 *   r12       offset from r4 of the source line to prefetch
 *   r0        integer data temporary
 *   fp9-fp12  8-byte data temporaries
 * Also written: ctr, cr0, cr1, cr5, cr6, cr7.
 */

.align  7

EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmplwi	cr1,r5,16	/* is size < 16 ?  */
	mr	r6,r3		/* r6 = working dst; r3 stays as return value  */
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dst boundary  */
	clrlwi  r8,r8,32-4	/* keep only those 4 bits (0..15)  */
	sub     r7,r4,r3	/* r7 = src - dst, used as index for l*x loads  */
	cmplwi	cr0,r8,0
	beq+	.Ldst_aligned

.Ldst_unaligned:
	/* Copy 1, 2, 4 and/or 8 bytes as selected by the bits of r8, so
	   that r6 ends up 16-byte aligned.  len >= 16 > r8 here, so the
	   subtraction below cannot underflow.  */
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	subf	r5,r8,r5

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
	lfdx	fp9,r7,r6	/* copy 8 byte  */
	stfd	fp9,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* resync src cursor: r4 = (src - dst) + dst cursor  */

.Ldst_aligned:

	/* r5 is an unsigned size_t, so compare it unsigned.  A signed
	   compare would treat lengths >= 2 GiB as negative and send them
	   down the 16-bytes-per-iteration tail loop.  */
	cmplwi	cr5,r5,128-1

	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stfdu  */
	addi	r4,r4,-8	/* prepare for lfdu  */

	clrlwi  r7,r7,32-7	/* r7 = # bytes to next 128-byte dst boundary  */
	ble+	cr5,.Llessthancacheline

	cmplwi	cr6,r7,0	/* cr6.eq: dst already cache line aligned  */
	subf	r5,r7,r5	/* r5 = bytes left once dst is line aligned  */
	srwi	r7,r7,4		/* alignment bytes / 16 = 16-byte steps needed  */
	srwi	r10,r5,7	/* number of cache lines to copy  */

	cmplwi	r10,0
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	.Lnocacheprefetch

	cmplwi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance  */
	ble	.Llessthanmaxprefetch

	/* More than PREFETCH_AHEAD lines: r11 lines go through .Lloop,
	   the final PREFETCH_AHEAD lines through .Lloop2.  */
	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
	mtctr	r10

.LprefetchSRC:
	dcbt    r12,r4
	addi    r12,r12,128
	bdnz    .LprefetchSRC

.Lnocacheprefetch:
	mtctr	r7
	cmplwi	cr1,r5,128	/* cr1.lt: no whole cache line left to copy  */
	clrlwi  r5,r5,32-7	/* r5 = tail bytes after the whole lines  */
	beq	cr6,.Lcachelinealigned

.Laligntocacheline:
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:		/* copy while cache lines  */

	blt-	cr1,.Llessthancacheline	/* size <128  */

.Louterloop:
	cmpwi   r11,0
	mtctr	r11
	beq-	.Lendloop

	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist  */

.align	4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
.Lloop:				/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	lfd	fp9,  0x08(r4)
	dcbz	r11,r6		/* zero a DST line ZERO_AHEAD lines ahead  */
	lfd	fp10, 0x10(r4)	/* 4 register stride copy is optimal  */
	lfd	fp11, 0x18(r4)	/* to hide 1st level cache latency.  */
	lfd	fp12, 0x20(r4)
	stfd	fp9,  0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9,  0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfd	fp12, 0x40(r4)
	stfd	fp9,  0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfd	fp12, 0x40(r6)
	lfd	fp9,  0x48(r4)
	lfd	fp10, 0x50(r4)
	lfd	fp11, 0x58(r4)
	lfd	fp12, 0x60(r4)
	stfd	fp9,  0x48(r6)
	stfd	fp10, 0x50(r6)
	stfd	fp11, 0x58(r6)
	stfd	fp12, 0x60(r6)
	lfd	fp9,  0x68(r4)
	lfd	fp10, 0x70(r4)
	lfd	fp11, 0x78(r4)
	lfdu	fp12, 0x80(r4)
	stfd	fp9,  0x68(r6)
	stfd	fp10, 0x70(r6)
	stfd	fp11, 0x78(r6)
	stfdu	fp12, 0x80(r6)

	bdnz	.Lloop

.Lendloop:
	cmpwi	r10,0
	slwi	r10,r10,2	/* adjust from 128 to 32 byte stride  */
	beq-	.Lendloop2
	mtctr	r10

.Lloop2:			/* Copy aligned body  */
	lfd	fp9,  0x08(r4)
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfdu	fp12, 0x20(r4)
	stfd	fp9,  0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfdu	fp12, 0x20(r6)

	bdnz	.Lloop2
.Lendloop2:

.Llessthancacheline:		/* less than cache to do ?  */
	cmplwi	cr0,r5,16
	srwi	r7,r5,4		/* divide size by 16  */
	blt-	.Ldo_lt16
	mtctr	r7

.Lcopy_remaining:
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	.Lcopy_remaining

.Ldo_lt16:			/* less than 16 ?  */
	cmplwi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* no rest to copy  */
	addi	r4,r4,8		/* undo the -8 bias applied at .Ldst_aligned  */
	addi	r6,r6,8

.Lshortcopy:			/* SIMPLE COPY to handle size =< 15 bytes  */
	/* Only the low 4 bits of r5 are examined: 8, 4, 2, then 1 byte.  */
	mtcrf	0x01,r5
	sub	r7,r4,r6	/* r7 = src - dst, used as index for l*x loads  */
	bf-	cr7*4+0,8f
	lfdx	fp9,r7,r6	/* copy 8 byte  */
	stfd	fp9,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
1:	blr

END (memcpy)
libc_hidden_builtin_def (memcpy)