/* Optimized memcmp implementation for POWER10.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* TODO: Replace this macro by the actual instruction when minimum binutils
   becomes >= 2.35.  This is used to keep compatibility with older
   versions.  */

/* Hand-encoded lxvp (load VSX vector pair): loads 32 bytes from DQ(RA)
   into the register pair XTP/XTP+1.  Callers pass XTP as 32 + vN with N
   even.  DQ is OR-ed unchanged into the low bits of the instruction word;
   the values used in this file are 0, 32, 64 and 96.  */
#define LXVP(xtp,dq,ra)			   \
	.long(((6)<<(32-6))		   \
	      | ((((xtp)-32)>>1)<<(32-10)) \
	      | ((1)<<(32-11))		   \
	      | ((ra)<<(32-16))		   \
	      | (dq))

/* Compare 32 bytes at OFFSET from r3 and r4.  VR1/VR1+1 receive the bytes
   of s1 and VR2/VR2+1 the bytes of s2.  The odd register of each pair is
   checked first because, on little-endian, lxvp places the
   lower-addressed 16 bytes there.  A mismatch in the odd registers
   branches to TAIL_2 (result mask in v5); a mismatch in the even
   registers branches to TAIL_1 (result mask in v4).  Falls through when
   all 32 bytes are equal.  */
#define COMPARE_32(vr1,vr2,offset,tail_1,tail_2)\
	LXVP(32+vr1,offset,r3);		\
	LXVP(32+vr2,offset,r4);		\
	vcmpneb.  v5,vr1+1,vr2+1;	\
	bne	  cr6,L(tail_2);	\
	vcmpneb.  v4,vr1,vr2;		\
	bne	  cr6,L(tail_1);

/* Compute the return value and return.  V_RES is the vcmpneb. result for
   the vectors S1 and S2.  r7 receives the index of the first differing
   byte, r8 and r9 the corresponding bytes of S1 and S2, and r3 their
   difference (r8 - r9).  When S1 and S2 are identical the two extracted
   bytes are equal, so the result is 0.  */
#define TAIL(v_res,s1,s2)	\
	vctzlsbb  r7,v_res;	\
	vextubrx  r8,r7,s1;	\
	vextubrx  r9,r7,s2;	\
	subf	  r3,r9,r8;	\
	blr;

/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4],
		    size_t size [r5])

   Register usage:
     r3, r4	current positions in s1 and s2; r3 also holds the result.
     r5		number of bytes still to compare.
     r6, r7	r3 + 16 and r4 + 16 in the tail code; r7 is reused by TAIL.
     r8		constant 16 << 56 in the tail code; loop count; TAIL scratch.
     r9		remaining length in the top byte, as lxvl expects; TAIL
		scratch.
     v0-v17	loaded data; v4 and v5 hold the compare results.  */

#ifndef MEMCMP
# define MEMCMP memcmp
#endif
	.machine  power9
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

	cmpldi	cr6,r5,64
	bgt	cr6,L(loop_head)

/* Compare up to 64 bytes.  This section is used for lengths <= 64 and for
   the last bytes of larger lengths.  */
L(last_compare):
	li	r8,16

	/* lxvl takes its byte count from the most-significant byte of the
	   length register, so keep both the remaining length (r9) and the
	   per-step decrement of 16 (r8) shifted left by 56.  */
	sldi	r9,r5,56
	sldi	r8,r8,56
	addi	r6,r3,16
	addi	r7,r4,16

	/* Load up to 16 bytes from each buffer.  Bytes beyond the requested
	   length are zero in both registers, so they compare equal.  */
	lxvl	32+v0,r3,r9
	lxvl	32+v2,r4,r9

	/* Fold the length check and the compare into a single branch.
	   sub. sets CR0.GT when more than 16 bytes were still pending and
	   vcmpneb. sets CR6.EQ when no byte differs.  crnand stores
	   NOT (CR0.GT AND CR6.EQ) in CR0.LT, so the branch is taken when
	   either the buffers are exhausted or a mismatch was found.  If it
	   is taken without a mismatch, TAIL extracts the same byte from two
	   identical vectors and returns 0.  */
	sub.	r9,r9,r8
	vcmpneb.  v4,v0,v2
	crnand	4*cr0+lt,4*cr0+gt,4*cr6+eq
	bt	4*cr0+lt,L(tail1)

	addi	r3,r3,32
	addi	r4,r4,32

	/* Bytes 16..31, addressed through r6/r7 computed above.  */
	lxvl	32+v1,r6,r9
	lxvl	32+v3,r7,r9
	sub.	r9,r9,r8
	vcmpneb.  v5,v1,v3
	crnand	4*cr0+lt,4*cr0+gt,4*cr6+eq
	bt	4*cr0+lt,L(tail2)

	addi	r6,r3,16
	addi	r7,r4,16

	/* Bytes 32..47.  */
	lxvl	32+v6,r3,r9
	lxvl	32+v8,r4,r9
	sub.	r9,r9,r8
	vcmpneb.  v4,v6,v8
	crnand	4*cr0+lt,4*cr0+gt,4*cr6+eq
	bt	4*cr0+lt,L(tail3)

	/* Bytes 48..63.  Nothing can follow, so only the compare result
	   matters here.  */
	lxvl	32+v7,r6,r9
	lxvl	32+v9,r7,r9
	vcmpneb.  v5,v7,v9
	bne	cr6,L(tail4)

L(finish):
	/* The contents are equal.  */
	li	r3,0
	blr

L(loop_head):
	/* Calculate how many 128-byte iterations to run (size / 128).  */
	srdi.	r8,r5,7
	beq	L(loop_tail)
	mtctr	r8

/* Main loop.  Compares 128 bytes each iteration.  */
	.p2align 5
L(loop_128):
	COMPARE_32(v0,v2,0,tail1,tail2)
	COMPARE_32(v6,v8,32,tail3,tail4)
	COMPARE_32(v10,v12,64,tail5,tail6)
	COMPARE_32(v14,v16,96,tail7,tail8)

	addi	r3,r3,128
	addi	r4,r4,128
	bdnz	L(loop_128)

	/* Keep only the bytes the loop has not compared (size mod 128).  */
	clrldi.  r5,r5,57
	beq	L(finish)

/* Compare 64 bytes if the remaining length is still bigger than 64 bytes,
   then let L(last_compare) handle whatever is left.  */
	.p2align 5
L(loop_tail):
	cmpldi	r5,64
	ble	L(last_compare)
	COMPARE_32(v0,v2,0,tail1,tail2)
	COMPARE_32(v6,v8,32,tail3,tail4)
	addi	r3,r3,64
	addi	r4,r4,64
	subi	r5,r5,64
	b	L(last_compare)

/* Mismatch exits.  Each one pairs a compare result (v4 or v5) with the two
   data vectors it was computed from.  */
L(tail1):
	TAIL(v4,v0,v2)

L(tail2):
	TAIL(v5,v1,v3)

L(tail3):
	TAIL(v4,v6,v8)

L(tail4):
	TAIL(v5,v7,v9)

L(tail5):
	TAIL(v4,v10,v12)

L(tail6):
	TAIL(v5,v11,v13)

L(tail7):
	TAIL(v4,v14,v16)

L(tail8):
	TAIL(v5,v15,v17)

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)