/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>
# include "evex256-vecs.h"
# if VEC_SIZE != 32
#  error "VEC_SIZE != 32 unimplemented"
# endif

# ifndef MEMRCHR
#  define MEMRCHR	__memrchr_evex
# endif

# define PAGE_SIZE	4096
# define VECMATCH	VEC(0)

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
# else
	test	%RDX_LP, %RDX_LP
# endif
	jz	L(zero_0)

	/* Get end pointer.  Minus one for two reasons: 1) it is necessary for
	   a correct page cross check and 2) it sets up the end ptr so that
	   the match address can be computed by simply subtracting the lzcnt
	   of the match mask.  */
	leaq	-1(%rdi, %rdx), %rax
	vpbroadcastb %esi, %VECMATCH

	/* Check if we can load 1x VEC without crossing a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %eax
	jz	L(page_cross)

	/* Don't use rax for pointer here because EVEX has better encoding with
	   offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx

	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
	cmpq	$VEC_SIZE, %rdx
	ja	L(more_1x_vec)
L(ret_vec_x0_test):

	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
	   which guarantees edx (len) is less than or equal to it.  */
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Fits in aligning bytes of first cache line.  */
L(zero_0):
	xorl	%eax, %eax
	ret

	.p2align 4,, 9
L(ret_vec_x0_dec):
	decq	%rax
L(ret_vec_x0):
	lzcntl	%ecx, %ecx
	subq	%rcx, %rax
	ret

	.p2align 4,, 10
L(more_1x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Align rax (pointer to string).  */
	andq	$-VEC_SIZE, %rax

	/* Recompute length after aligning.  */
	movq	%rax, %rdx

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	%rdi, %rdx

	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):

	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
	decq	%rax
	cmpl	$VEC_SIZE, %edx
	jbe	L(ret_vec_x0_test)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Don't use rax for pointer here because EVEX has better encoding with
	   offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx
	/* NB: 64-bit lzcnt.  This will naturally add 32 to position.  */
	lzcntq	%rcx, %rcx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* This is an inexpensive place to put this code with regard to code
	   size / target alignments / ICache NLP.  Placing it here is
	   necessary for the 2-byte encoding of the jump to the page cross
	   case, which in turn is necessary for the hot path
	   (len <= VEC_SIZE) to fit in the first cache line.  */
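	/* Page cross case: the unaligned load of the last VEC_SIZE bytes
	   could touch an unmapped preceding page, so load the VEC_SIZE
	   aligned block containing the last byte instead and shift the match
	   bits left so that bits for bytes past the end pointer fall off.
	   lzcnt of the shifted mask is then the distance from the last byte
	   back to the last match.  */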
L(page_cross):
	movq	%rax, %rsi
	andq	$-VEC_SIZE, %rsi
	vpcmpb	$0, (%rsi), %VECMATCH, %k0
	kmovd	%k0, %r8d
	/* Shift out negative alignment (because we are starting from endptr
	   and working backwards).  */
	movl	%eax, %ecx
	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
	notl	%ecx
	shlxl	%ecx, %r8d, %ecx
	cmpq	%rdi, %rsi
	ja	L(more_1x_vec)
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_1)
	subq	%rcx, %rax
	ret

	/* Continue creating zero labels that fit in aligning bytes and get
	   2-byte encoding / are in the same cache line as condition.  */
L(zero_1):
	xorl	%eax, %eax
	ret

	.p2align 4,, 8
L(ret_vec_x1):
	/* This will naturally add 32 to position.  */
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_2x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	cmpl	$(VEC_SIZE * -1), %edx
	jle	L(ret_vec_x2_test)
L(last_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 3 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2_test):
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(ret_vec_x3):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_4x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	/* Check if near end before re-aligning (otherwise might do an
	   unnecessary loop iteration).  */
	addq	$-(VEC_SIZE * 4), %rax
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)

	decq	%rax
	andq	$-(VEC_SIZE * 4), %rax
	movq	%rdi, %rdx
	/* Get endptr for loop in rdx.  NB: Can't just do while rax > rdi
	   because lengths that overflow can be valid and break the
	   comparison.  */
	andq	$-(VEC_SIZE * 4), %rdx
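	/* The main loop checks 4x VEC per iteration, walking backwards in
	   VEC_SIZE * 4 steps.  Instead of four separate compares, the two
	   middle vectors are xor'd with VECMATCH (giving a zero byte at each
	   match), merged with a zero-masked vpminub under the not-equal mask
	   of the highest vector, and tested with vptestnmb.  A single
	   kortestd of that result with the compare mask of the lowest vector
	   then detects a match anywhere in the 4x VEC.  */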

	.p2align 4
L(loop_4x_vec):
	/* Store 1 where not-equal and 0 where equal in k1 (used to mask later
	   on).  */
	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1

	/* VEC(2/3) will have a zero byte where we found a CHAR.  */
	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4

	/* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero bit
	   where CHAR is found and VEC(2/3) have a zero byte where CHAR is
	   found).  */
	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
	vptestnmb %VEC(3), %VEC(3), %k2

	/* Any 1s and we found CHAR.  */
	kortestd %k2, %k4
	jnz	L(loop_end)

	addq	$-(VEC_SIZE * 4), %rax
	cmpq	%rdx, %rax
	jne	L(loop_4x_vec)

	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
	subq	$-(VEC_SIZE * 4), %rdx
	movq	%rdx, %rax
	subl	%edi, %edx
L(last_4x_vec):

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 3), %edx
	ja	L(last_vec)

	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	jbe	L(ret_1)
	xorl	%eax, %eax
L(ret_1):
	ret

	.p2align 4,, 6
L(loop_end):
	kmovd	%k1, %ecx
	notl	%ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_end)

	vptestnmb %VEC(2), %VEC(2), %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1_end)

	kmovd	%k2, %ecx
	kmovd	%k4, %esi
	/* Combine last 2 VEC matches.  If ecx (VEC3) is zero (no CHAR in
	   VEC3) then it won't affect the result in esi (VEC4).  If ecx is
	   non-zero then there is a CHAR in VEC3 and bsrq will use that
	   position.  */
	salq	$32, %rcx
	orq	%rsi, %rcx
	bsrq	%rcx, %rcx
	addq	%rcx, %rax
	ret
	.p2align 4,, 4
L(ret_vec_x0_end):
	addq	$(VEC_SIZE), %rax
L(ret_vec_x1_end):
	bsrl	%ecx, %ecx
	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
	ret

END(MEMRCHR)
#endif