/* Optimized memmove implementation for POWER10.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This implementation checks whether 'src' and 'dest' overlap.  If they do
   not, or if 'src' is ahead of 'dest', it copies forward.  Otherwise, an
   optimized backward copy is used.  */
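/* As a rough C sketch of that dispatch (illustrative only, not part of the
   build, and using plain byte loops instead of the vectorized paths in this
   file), a single unsigned subtract-and-compare classifies every case at
   once: no overlap, src ahead of dest (forward copy is safe), and dest
   inside [src, src + len) (backward copy is required):

     #include <stddef.h>
     #include <stdint.h>

     static void *
     memmove_sketch (void *dest, const void *src, size_t len)
     {
       unsigned char *d = dest;
       const unsigned char *s = src;
       if ((uintptr_t) d - (uintptr_t) s >= len)
         for (size_t i = 0; i < len; i++)
           d[i] = s[i];
       else
         for (size_t i = len; i > 0; i--)
           d[i - 1] = s[i - 1];
       return dest;
     }
 */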
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
	.machine power9
ENTRY_TOCLESS (MEMMOVE, 5)
	CALL_MCOUNT 3

L(_memmove):
	.p2align 5
	/* Check if there is overlap; if so, branch to the backward copy.  */
	subf	r9,r4,r3
	cmpld	cr7,r9,r5
	blt	cr7,L(memmove_bwd)

	/* Fast path for length shorter than 16 bytes.  lxvl/stxvl take the
	   length from the top byte of the GPR operand, hence the shift by
	   56.  */
	sldi	r7,r5,56
	lxvl	32+v2,r4,r7
	stxvl	32+v2,r3,r7
	subic.	r8,r5,16
	blelr

	/* For shorter lengths, aligning the dest address to 16 bytes either
	   decreases performance or is irrelevant, so this comparison is also
	   used to skip the alignment step.  */
	cmpldi	cr6,r5,256
	bge	cr6,L(ge_256)
	/* Account for the first 16-byte copy.  */
	addi	r4,r4,16
	addi	r11,r3,16	/* Use r11 so the dest address stays in r3.  */
	subi	r5,r5,16
	b	L(loop_head)

	.p2align 5
L(ge_256):
	/* Account for the first copy <= 16 bytes.  This is necessary for
	   memmove because at this point the src address can be in front of
	   the dest address.  */
	clrldi	r9,r5,56
	li	r8,16
	cmpldi	r9,16
	iselgt	r9,r8,r9	/* r9 = min (len % 256, 16), the size of the
				   copy already done above.  */
	add	r4,r4,r9
	add	r11,r3,r9	/* Use r11 so the dest address stays in r3.  */
	sub	r5,r5,r9

	/* Align dest to 16 bytes.  */
	neg	r7,r3
	clrldi.	r9,r7,60
	beq	L(loop_head)

	.p2align 5
	sldi	r6,r9,56
	lxvl	32+v0,r4,r6
	stxvl	32+v0,r11,r6
	sub	r5,r5,r9
	add	r4,r4,r9
	add	r11,r11,r9

L(loop_head):
	cmpldi	r5,63
	ble	L(final_64)

	srdi.	r7,r5,7
	beq	L(loop_tail)

	mtctr	r7

/* Main loop that copies 128 bytes each iteration.  */
	.p2align 5
L(loop):
	addi	r9,r4,64
	addi	r10,r11,64

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,128
	addi	r11,r11,128

	lxv	32+v4,0(r9)
	lxv	32+v5,16(r9)
	lxv	32+v6,32(r9)
	lxv	32+v7,48(r9)

	stxv	32+v4,0(r10)
	stxv	32+v5,16(r10)
	stxv	32+v6,32(r10)
	stxv	32+v7,48(r10)

	bdnz	L(loop)
	clrldi.	r5,r5,57	/* Remaining bytes (r5 % 128).  */
	beqlr

/* Copy 64 bytes.  */
	.p2align 5
L(loop_tail):
	cmpldi	cr5,r5,63
	ble	cr5,L(final_64)

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,64
	addi	r11,r11,64
	subi	r5,r5,64

/* Copies the last 1-63 bytes.  */
	.p2align 5
L(final_64):
	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
	clrrdi.	r8,r5,4
	beq	L(tail1)

	cmpldi	cr5,r5,32
	lxv	32+v0,0(r4)
	blt	cr5,L(tail2)

	cmpldi	cr6,r5,48
	lxv	32+v1,16(r4)
	blt	cr6,L(tail3)

	.p2align 5
	lxv	32+v2,32(r4)
	stxv	32+v2,32(r11)
L(tail3):
	stxv	32+v1,16(r11)
L(tail2):
	stxv	32+v0,0(r11)
	sub	r5,r5,r8
	add	r4,r4,r8
	add	r11,r11,r8
	.p2align 5
L(tail1):
	sldi	r6,r5,56
	lxvl	v4,r4,r6
	stxvl	v4,r11,r6
	blr

/* If dest and src overlap, we should copy backwards.  */
L(memmove_bwd):
	add	r11,r3,r5
	add	r4,r4,r5

	/* Optimization for length smaller than 16 bytes.  */
	cmpldi	cr5,r5,15
	ble	cr5,L(tail1_bwd)

	/* For shorter lengths the alignment either slows things down or is
	   irrelevant.  The forward copy reuses a comparison against 256 that
	   it needs anyway; here 128 is used instead, since it reduces code
	   size and improves readability.  */
	cmpldi	cr7,r5,128
	blt	cr7,L(bwd_loop_tail)

	/* Align dest address to 16 bytes.  */
	.p2align 5
	clrldi.	r9,r11,60
	beq	L(bwd_loop_head)
	sub	r4,r4,r9
	sub	r11,r11,r9
	lxv	32+v0,0(r4)
	sldi	r6,r9,56
	stxvl	32+v0,r11,r6
	sub	r5,r5,r9

L(bwd_loop_head):
	srdi.	r7,r5,7
	beq	L(bwd_loop_tail)

	mtctr	r7

/* Main loop that copies 128 bytes each iteration.  */
	.p2align 5
L(bwd_loop):
	addi	r9,r4,-64
	addi	r10,r11,-64

	lxv	32+v0,-16(r4)
	lxv	32+v1,-32(r4)
	lxv	32+v2,-48(r4)
	lxv	32+v3,-64(r4)

	stxv	32+v0,-16(r11)
	stxv	32+v1,-32(r11)
	stxv	32+v2,-48(r11)
	stxv	32+v3,-64(r11)

	addi	r4,r4,-128
	addi	r11,r11,-128

	lxv	32+v0,-16(r9)
	lxv	32+v1,-32(r9)
	lxv	32+v2,-48(r9)
	lxv	32+v3,-64(r9)

	stxv	32+v0,-16(r10)
	stxv	32+v1,-32(r10)
	stxv	32+v2,-48(r10)
	stxv	32+v3,-64(r10)

	bdnz	L(bwd_loop)
	clrldi.	r5,r5,57	/* Remaining bytes (r5 % 128).  */
	beqlr

/* Copy 64 bytes.  */
	.p2align 5
L(bwd_loop_tail):
	cmpldi	cr5,r5,63
	ble	cr5,L(bwd_final_64)

	addi	r4,r4,-64
	addi	r11,r11,-64

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	subi	r5,r5,64

/* Copies the last 1-63 bytes.  */
	.p2align 5
L(bwd_final_64):
	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
	clrrdi.	r8,r5,4
	beq	L(tail1_bwd)

	cmpldi	cr5,r5,32
	lxv	32+v2,-16(r4)
	blt	cr5,L(tail2_bwd)

	cmpldi	cr6,r5,48
	lxv	32+v1,-32(r4)
	blt	cr6,L(tail3_bwd)

	.p2align 5
	lxv	32+v0,-48(r4)
	stxv	32+v0,-48(r11)
L(tail3_bwd):
	stxv	32+v1,-32(r11)
L(tail2_bwd):
	stxv	32+v2,-16(r11)
	/* Step back to the start of the remaining block and copy its first
	   r5 % 16 bytes, which the stxv stores above did not cover.  */
	sub	r4,r4,r5
	sub	r11,r11,r5
	sub	r5,r5,r8
	sldi	r6,r5,56
	lxvl	v4,r4,r6
	stxvl	v4,r11,r6
	blr

/* Copy the remaining bytes (fewer than 16).  */
	.p2align 5
L(tail1_bwd):
	sub	r4,r4,r5
	sub	r11,r11,r5
	sldi	r6,r5,56
	lxvl	v4,r4,r6
	stxvl	v4,r11,r6
	blr

END_GEN_TB (MEMMOVE,TB_TOCLESS)
libc_hidden_builtin_def (memmove)