/* memrchr optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation,
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# ifndef MEMRCHR
#  define MEMRCHR __memrchr_sse2
# endif

# include <sysdep.h>
# define VEC_SIZE 16
# define PAGE_SIZE 4096

        .text
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
        /* Clear upper bits.  */
        mov %RDX_LP, %RDX_LP
# endif
        movd %esi, %xmm0

        /* Get end pointer.  */
        leaq (%rdx, %rdi), %rcx

        punpcklbw %xmm0, %xmm0
        punpcklwd %xmm0, %xmm0
        pshufd $0, %xmm0, %xmm0

        /* Check if we can load 1x VEC without crossing a page.  */
        testl $(PAGE_SIZE - VEC_SIZE), %ecx
        jz L(page_cross)

        /* NB: This load happens regardless of whether rdx (len) is zero.  Since
           it doesn't cross a page and the standard guarantees any pointer has
           at least one valid byte, this load must be safe.  For the entire
           history of the x86 memrchr implementation this has been possible, so
           no code "should" be relying on a zero-length check before this load.
           The zero-length check is moved to the page cross case because it is
           1) pretty cold and 2) including it would push the hot case
           (len <= VEC_SIZE) into 2 cache lines.  */
        movups -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subq $VEC_SIZE, %rdx
        ja L(more_1x_vec)
L(ret_vec_x0_test):
        /* Zero-flag set if eax (src) is zero.  Destination unchanged if src is
           zero.  */
        bsrl %eax, %eax
        jz L(ret_0)
        /* Check if the CHAR match is in bounds.  Need to truly zero `eax` here
           if out of bounds.  */
        addl %edx, %eax
        jl L(zero_0)
        /* Since we subtracted VEC_SIZE from rdx earlier we can just add to the
           base ptr.  */
        addq %rdi, %rax
L(ret_0):
        ret

        .p2align 4,, 5
L(ret_vec_x0):
        bsrl %eax, %eax
        leaq -(VEC_SIZE)(%rcx, %rax), %rax
        ret

        .p2align 4,, 2
L(zero_0):
        xorl %eax, %eax
        ret


        .p2align 4,, 8
L(more_1x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x0)

        /* Align rcx (pointer to string).  */
        decq %rcx
        andq $-VEC_SIZE, %rcx

        movq %rcx, %rdx
        /* NB: We could consistently save 1 byte in this pattern with `movaps
           %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`.  The reason against it is
           that it adds more frontend uops (even if the moves can be eliminated)
           and, some percentage of the time, actual backend uops.  */
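        /* The compare blocks below all follow the same pattern: load one
           16-byte vector, pcmpeqb it against xmm0 (CHAR broadcast to every
           byte), take the pmovmskb byte mask, and if the mask is non-zero
           translate its highest set bit back into an address.  A rough C
           sketch of that translation as done by L(ret_vec_x0) through
           L(ret_vec_x3) (the helper name match_addr and its signature are
           illustrative only, not part of this file):

               // vec_end: the value of rcx at the jump (the end pointer,
               //          possibly aligned down to VEC_SIZE).
               // mask: non-zero pmovmskb result for the n-th vector below
               //       vec_end, i.e. bytes
               //       [vec_end - 16 * n, vec_end - 16 * (n - 1)).
               static char *match_addr (char *vec_end, unsigned int mask, int n)
               {
                 int i = 31 - __builtin_clz (mask);	// bsrl: highest match
                 return vec_end - 16 * n + i;
               }

           The *_test return paths additionally check that the match is not
           below the original start pointer (rdi) before returning.  */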
        movaps -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        subq %rdi, %rdx
        pmovmskb %xmm1, %eax

        cmpq $(VEC_SIZE * 2), %rdx
        ja L(more_2x_vec)
L(last_2x_vec):
        subl $VEC_SIZE, %edx
        jbe L(ret_vec_x0_test)

        testl %eax, %eax
        jnz L(ret_vec_x0)

        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subl $VEC_SIZE, %edx
        bsrl %eax, %eax
        jz L(ret_1)
        addl %edx, %eax
        jl L(zero_0)
        addq %rdi, %rax
L(ret_1):
        ret

        /* Don't align.  Otherwise the jump to L(page_cross) loses its 2-byte
           encoding, which causes the hot path (length <= VEC_SIZE) to span
           multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
L(page_cross):
        /* Zero length check.  */
        testq %rdx, %rdx
        jz L(zero_0)

        leaq -1(%rcx), %r8
        andq $-(VEC_SIZE), %r8

        movaps (%r8), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %esi
        /* Shift out negative alignment (because we are starting from endptr
           and working backwards).  */
        negl %ecx
        /* 32-bit shift, but VEC_SIZE=16 so we need to mask the shift count
           explicitly.  */
        andl $(VEC_SIZE - 1), %ecx
        shl %cl, %esi
        movzwl %si, %eax
        leaq (%rdi, %rdx), %rcx
        cmpq %rdi, %r8
        ja L(more_1x_vec)
        subl $VEC_SIZE, %edx
        bsrl %eax, %eax
        jz L(ret_2)
        addl %edx, %eax
        jl L(zero_1)
        addq %rdi, %rax
L(ret_2):
        ret

        /* Fits in the aligning bytes.  */
L(zero_1):
        xorl %eax, %eax
        ret

        .p2align 4,, 5
L(ret_vec_x1):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
        ret

        .p2align 4,, 8
L(more_2x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x0)

        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        testl %eax, %eax
        jnz L(ret_vec_x1)


        movaps -(VEC_SIZE * 3)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subq $(VEC_SIZE * 4), %rdx
        ja L(more_4x_vec)

        addl $(VEC_SIZE), %edx
        jle L(ret_vec_x2_test)

L(last_vec):
        testl %eax, %eax
        jnz L(ret_vec_x2)

        movaps -(VEC_SIZE * 4)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subl $(VEC_SIZE), %edx
        bsrl %eax, %eax
        jz L(ret_3)
        addl %edx, %eax
        jl L(zero_2)
        addq %rdi, %rax
L(ret_3):
        ret

        .p2align 4,, 6
L(ret_vec_x2_test):
        bsrl %eax, %eax
        jz L(zero_2)
        addl %edx, %eax
        jl L(zero_2)
        addq %rdi, %rax
        ret

L(zero_2):
        xorl %eax, %eax
        ret


        .p2align 4,, 5
L(ret_vec_x2):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
        ret

        .p2align 4,, 5
L(ret_vec_x3):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
        ret

        .p2align 4,, 8
L(more_4x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x2)

        movaps -(VEC_SIZE * 4)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        testl %eax, %eax
        jnz L(ret_vec_x3)

        addq $-(VEC_SIZE * 4), %rcx
        cmpq $(VEC_SIZE * 4), %rdx
        jbe L(last_4x_vec)

        /* Offset everything by 4x VEC_SIZE here to save a few bytes at the
           end, keeping the code from spilling to the next cache line.  */
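        /* Rough C sketch (illustrative only; loop_4x_bounds, start and
           aligned_end are not names used in this file) of the two bounds
           computed below: the loop repeatedly checks the 64 bytes just below
           rcx and then steps rcx down by 4x VEC_SIZE until it reaches rdx.
           Everything between the initial rcx and the VEC_SIZE-aligned end has
           already been checked by the four vectors above, and at most
           4x VEC_SIZE bytes are left for L(last_4x_vec) when the loop exits.

               #include <stdint.h>
               static void loop_4x_bounds (uintptr_t start, uintptr_t aligned_end,
                                           uintptr_t *first_block_end,
                                           uintptr_t *stop)
               {
                 // rcx when the loop is entered: the highest multiple of 64
                 // strictly below aligned_end (which is only 16-byte aligned).
                 *first_block_end = (aligned_end - 1) & ~(uintptr_t) 63;
                 // rdx, the rcx value that terminates the loop: the first
                 // multiple of 64 strictly above start.
                 *stop = (start + 64) & ~(uintptr_t) 63;
               }

           Checking 4x VEC_SIZE per iteration amortizes the loop overhead and
           lets a single pmovmskb of the OR'd compare results detect a match
           anywhere in the 64-byte block.  */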
        addq $(VEC_SIZE * 4 - 1), %rcx
        andq $-(VEC_SIZE * 4), %rcx
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        andq $-(VEC_SIZE * 4), %rdx

        .p2align 4,, 11
L(loop_4x_vec):
        movaps (VEC_SIZE * -1)(%rcx), %xmm1
        movaps (VEC_SIZE * -2)(%rcx), %xmm2
        movaps (VEC_SIZE * -3)(%rcx), %xmm3
        movaps (VEC_SIZE * -4)(%rcx), %xmm4
        pcmpeqb %xmm0, %xmm1
        pcmpeqb %xmm0, %xmm2
        pcmpeqb %xmm0, %xmm3
        pcmpeqb %xmm0, %xmm4

        por %xmm1, %xmm2
        por %xmm3, %xmm4
        por %xmm2, %xmm4

        pmovmskb %xmm4, %esi
        testl %esi, %esi
        jnz L(loop_end)

        addq $-(VEC_SIZE * 4), %rcx
        cmpq %rdx, %rcx
        jne L(loop_4x_vec)

        subl %edi, %edx

        /* Ends up being a 1-byte nop.  */
        .p2align 4,, 2
L(last_4x_vec):
        movaps -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        cmpl $(VEC_SIZE * 2), %edx
        jbe L(last_2x_vec)

        testl %eax, %eax
        jnz L(ret_vec_x0)


        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        testl %eax, %eax
        jnz L(ret_vec_end)

        movaps -(VEC_SIZE * 3)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subl $(VEC_SIZE * 3), %edx
        ja L(last_vec)
        bsrl %eax, %eax
        jz L(ret_4)
        addl %edx, %eax
        jl L(zero_3)
        addq %rdi, %rax
L(ret_4):
        ret

        /* Ends up being a 1-byte nop.  */
        .p2align 4,, 3
L(loop_end):
        pmovmskb %xmm1, %eax
        sall $16, %eax
        jnz L(ret_vec_end)

        pmovmskb %xmm2, %eax
        testl %eax, %eax
        jnz L(ret_vec_end)

        pmovmskb %xmm3, %eax
        /* Combine the last 2 VEC matches.  If eax (VEC3) is zero (no CHAR in
           VEC3) then it won't affect the result in esi (VEC4).  If eax is
           non-zero then CHAR is in VEC3 and bsrl will use that position (see
           the C sketch at the end of the file).  */
        sall $16, %eax
        orl %esi, %eax
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
        ret

L(ret_vec_end):
        bsrl %eax, %eax
        leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
        ret
        /* Used in L(last_4x_vec).  In the same cache line.  This is just
           spare aligning bytes.  */
L(zero_3):
        xorl %eax, %eax
        ret
        /* 2 bytes from the next cache line.  */
END(MEMRCHR)
#endif
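/* Rough C sketch (illustrative only; combine_last_2 and its parameters are
   not names used above) of the mask combining in L(loop_end): matches from
   VEC3 are shifted into bits 16..31 so that a single bsrl prefers a VEC3
   match over a VEC4 match, and the resulting bit index doubles as the byte
   offset from the bottom of the 64-byte block (rcx - VEC_SIZE * 4).

       // block: bottom of the 64-byte block, i.e. rcx - 64.
       // mask3: pmovmskb of the compare for bytes [block + 16, block + 32).
       // mask_or: esi, the pmovmskb of the OR of all four compares.  VEC1
       //          and VEC2 are known to be match-free on this path, so its
       //          bits come from VEC3 and VEC4 only, and any stray VEC3 bit
       //          in the low half is dominated by the shifted copy in the
       //          high half.
       static char *combine_last_2 (char *block, unsigned int mask3,
                                    unsigned int mask_or)
       {
         unsigned int m = (mask3 << 16) | mask_or;
         int i = 31 - __builtin_clz (m);	// bsrl; m != 0 on this path
         return block + i;
       }
*/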