/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_evex
# endif

# define VMOVU	vmovdqu64
# define VMOVA	vmovdqa64

# ifdef USE_AS_WCSRCHR
#  define SHIFT_REG	esi

#  define kunpck	kunpckbw
#  define kmov_2x	kmovd
#  define maskz_2x	ecx
#  define maskm_2x	eax
#  define CHAR_SIZE	4
#  define VPMIN	vpminud
#  define VPTESTN	vptestnmd
#  define VPBROADCAST	vpbroadcastd
#  define VPCMP	vpcmpd
# else
#  define SHIFT_REG	edi

#  define kunpck	kunpckdq
#  define kmov_2x	kmovq
#  define maskz_2x	rcx
#  define maskm_2x	rax

#  define CHAR_SIZE	1
#  define VPMIN	vpminub
#  define VPTESTN	vptestnmb
#  define VPBROADCAST	vpbroadcastb
#  define VPCMP	vpcmpb
# endif

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMMMATCH	ymm17
# define YMMSAVE	ymm18

# define YMM1	ymm19
# define YMM2	ymm20
# define YMM3	ymm21
# define YMM4	ymm22
# define YMM5	ymm23
# define YMM6	ymm24
# define YMM7	ymm25
# define YMM8	ymm26


# define VEC_SIZE	32
# define PAGE_SIZE	4096
	.section .text.evex, "ax", @progbits
ENTRY(STRRCHR)
	movl	%edi, %eax
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH

	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_boundary)

L(page_cross_continue):
	VMOVU	(%rdi), %YMM1
	/* k0 has a 1 for each zero CHAR in YMM1.  */
	VPTESTN	%YMM1, %YMM1, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jz	L(aligned_more)
	/* fallthrough: zero CHAR in first VEC.  */

	/* K1 has a 1 for each search CHAR match in YMM1.  */
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k1, %eax
	/* Build mask up until first zero CHAR (used to mask off
	   potential search CHAR matches past the end of the string).
	 */
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jz	L(ret0)
	/* Get last match (the `andl` removed any out of bounds
	   matches).  */
	bsrl	%eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
L(ret0):
	ret

	/* Returns for first vec x1/x2/x3 have hard coded backward
	   search path for earlier matches.  */
	.p2align 4,, 6
L(first_vec_x1):
	VPCMP	$0, %YMMMATCH, %YMM2, %k1
	kmovd	%k1, %eax
	blsmskl	%ecx, %ecx
	/* eax non-zero if search CHAR in range.  */
	andl	%ecx, %eax
	jnz	L(first_vec_x1_return)

	/* fallthrough: no match in YMM2 so need to check for earlier
	   matches (in YMM1).  */
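	/* L(first_vec_x0_test): reached once a later vector is known to
	   hold the null terminator but no in-range search CHAR match.
	   Re-test YMM1 (the first, unaligned vector); %rsi still holds
	   the original src pointer, so the return value is computed
	   relative to it.  YMM1 was already checked for zero CHARs, so
	   every match bit here is within the string and no masking is
	   needed.  */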
	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jz	L(ret1)
	bsrl	%eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq	(%rsi, %rax, CHAR_SIZE), %rax
# else
	addq	%rsi, %rax
# endif
L(ret1):
	ret

	.p2align 4,, 10
L(first_vec_x1_or_x2):
	VPCMP	$0, %YMM3, %YMMMATCH, %k3
	VPCMP	$0, %YMM2, %YMMMATCH, %k2
	/* K2 and K3 have 1 for any search CHAR match.  Test if there is
	   any match in either of them.  Otherwise check YMM1.  */
	kortestd %k2, %k3
	jz	L(first_vec_x0_test)

	/* Guaranteed that YMM2 and YMM3 are within range so merge the
	   two bitmasks then get last result.  */
	kunpck	%k2, %k3, %k3
	kmovq	%k3, %rax
	bsrq	%rax, %rax
	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x3):
	VPCMP	$0, %YMMMATCH, %YMM4, %k1
	kmovd	%k1, %eax
	blsmskl	%ecx, %ecx
	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
	andl	%ecx, %eax
	jz	L(first_vec_x1_or_x2)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x0_x1_test):
	VPCMP	$0, %YMMMATCH, %YMM2, %k1
	kmovd	%k1, %eax
	/* Check YMM2 for last match first.  If no match try YMM1.  */
	testl	%eax, %eax
	jz	L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsrl	%eax, %eax
	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 10
L(first_vec_x2):
	VPCMP	$0, %YMMMATCH, %YMM3, %k1
	kmovd	%k1, %eax
	blsmskl	%ecx, %ecx
	/* Check YMM3 for last match first.  If no match try YMM2/YMM1.
	 */
	andl	%ecx, %eax
	jz	L(first_vec_x0_x1_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4
L(aligned_more):
	/* Need to keep original pointer in case YMM1 has last match.  */
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rdi
	VMOVU	VEC_SIZE(%rdi), %YMM2
	VPTESTN	%YMM2, %YMM2, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x1)

	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
	VPTESTN	%YMM3, %YMM3, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x2)

	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
	VPTESTN	%YMM4, %YMM4, %k0
	kmovd	%k0, %ecx
	movq	%rdi, %r8
	testl	%ecx, %ecx
	jnz	L(first_vec_x3)

	andq	$-(VEC_SIZE * 2), %rdi
	.p2align 4
L(first_aligned_loop):
	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
	   they don't store a match.  */
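	/* Detection trick for the 2x-vector loop: XORing YMM6 with
	   YMMMATCH gives a zero lane wherever YMM6 holds the search
	   CHAR, and the unsigned VPMIN of YMM5/YMM6 gives a zero lane
	   wherever either vector holds a null CHAR.  Folding both
	   through a second VPMIN means one VPTESTN (k1) flags any null
	   in YMM5/YMM6 or any match in YMM6, while k2 separately flags
	   matches in YMM5, so a single kortest decides whether the loop
	   can continue.  */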
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6

	VPCMP	$0, %YMM5, %YMMMATCH, %k2
	vpxord	%YMM6, %YMMMATCH, %YMM7

	VPMIN	%YMM5, %YMM6, %YMM8
	VPMIN	%YMM8, %YMM7, %YMM7

	VPTESTN	%YMM7, %YMM7, %k1
	subq	$(VEC_SIZE * -2), %rdi
	kortestd %k1, %k2
	jz	L(first_aligned_loop)

	VPCMP	$0, %YMM6, %YMMMATCH, %k3
	VPTESTN	%YMM8, %YMM8, %k1
	ktestd	%k1, %k1
	jz	L(second_aligned_loop_prep)

	kortestd %k2, %k3
	jnz	L(return_first_aligned_loop)

	.p2align 4,, 6
L(first_vec_x1_or_x2_or_x3):
	VPCMP	$0, %YMM4, %YMMMATCH, %k4
	kmovd	%k4, %eax
	testl	%eax, %eax
	jz	L(first_vec_x1_or_x2)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 8
L(return_first_aligned_loop):
	VPTESTN	%YMM5, %YMM5, %k0
	kunpck	%k0, %k1, %k0
	kmov_2x	%k0, %maskz_2x

	blsmsk	%maskz_2x, %maskz_2x
	kunpck	%k2, %k3, %k3
	kmov_2x	%k3, %maskm_2x
	and	%maskz_2x, %maskm_2x
	jz	L(first_vec_x1_or_x2_or_x3)

	bsr	%maskm_2x, %maskm_2x
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
	/* We can throw away the work done for the first 4x checks here
	   as we have a later match.  This is the 'fast' path per se.
	 */
L(second_aligned_loop_prep):
L(second_aligned_loop_set_furthest_match):
	movq	%rdi, %rsi
	kunpck	%k2, %k3, %k4

	.p2align 4
L(second_aligned_loop):
	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2

	VPCMP	$0, %YMM1, %YMMMATCH, %k2
	vpxord	%YMM2, %YMMMATCH, %YMM3

	VPMIN	%YMM1, %YMM2, %YMM4
	VPMIN	%YMM3, %YMM4, %YMM3

	VPTESTN	%YMM3, %YMM3, %k1
	subq	$(VEC_SIZE * -2), %rdi
	kortestd %k1, %k2
	jz	L(second_aligned_loop)

	VPCMP	$0, %YMM2, %YMMMATCH, %k3
	VPTESTN	%YMM4, %YMM4, %k1
	ktestd	%k1, %k1
	jz	L(second_aligned_loop_set_furthest_match)

	kortestd %k2, %k3
	/* Branch here because there is a significant advantage in terms
	   of output dependency chains in using edx.  */
	jnz	L(return_new_match)
L(return_old_match):
	kmovq	%k4, %rax
	bsrq	%rax, %rax
	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
	ret

L(return_new_match):
	VPTESTN	%YMM1, %YMM1, %k0
	kunpck	%k0, %k1, %k0
	kmov_2x	%k0, %maskz_2x

	blsmsk	%maskz_2x, %maskz_2x
	kunpck	%k2, %k3, %k3
	kmov_2x	%k3, %maskm_2x
	and	%maskz_2x, %maskm_2x
	jz	L(return_old_match)

	bsr	%maskm_2x, %maskm_2x
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

L(cross_page_boundary):
	/* eax contains all the page offset bits of src (rdi).  `xor rdi,
	   rax` gives a pointer with all page offset bits cleared, so an
	   offset of (PAGE_SIZE - VEC_SIZE) will get the last aligned VEC
	   before the page cross (guaranteed to be safe to read).  Doing
	   this as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax`
	   saves a bit of code size.  */
	xorq	%rdi, %rax
	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
	VPTESTN	%YMM1, %YMM1, %k0
	kmovd	%k0, %ecx

	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
# ifdef USE_AS_WCSRCHR
	movl	%edi, %esi
	andl	$(VEC_SIZE - 1), %esi
	shrl	$2, %esi
# endif
	shrxl	%SHIFT_REG, %ecx, %ecx

	testl	%ecx, %ecx
	jz	L(page_cross_continue)

	/* Found zero CHAR so need to test for search CHAR.  */
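	/* As in the non-cross-page case: shift the match mask by the
	   same amount as the zero-CHAR mask so both are relative to the
	   start of src, then blsmsk builds a mask of all bits up to and
	   including the first null so the `and` discards matches past
	   the end of the string.  */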
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k1, %eax
	/* Shift out search CHAR matches that are before the beginning of
	   src (rdi).  */
	shrxl	%SHIFT_REG, %eax, %eax

	/* Check if any search CHAR match in range.  */
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jz	L(ret3)
	bsrl	%eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
L(ret3):
	ret

END(STRRCHR)
#endif