/* memchr implemented using NEON.
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* For __ARM_NEON__ this file defines memchr.  */
#ifndef __ARM_NEON__
# define memchr __memchr_neon
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(a)
#endif

	.arch	armv7-a
	.fpu	neon


/* Arguments */
#define srcin		r0
#define chrin		r1
#define cntin		r2

/* Retval */
#define result		r0	/* Live range does not overlap with srcin */

/* Working registers */
#define src		r1	/* Live range does not overlap with chrin */
#define tmp		r3
#define synd		r0	/* No overlap with srcin or result */
#define soff		r12

/* Working NEON registers */
#define vrepchr		q0
#define vdata0		q1
#define vdata0_0	d2	/* Lower half of vdata0 */
#define vdata0_1	d3	/* Upper half of vdata0 */
#define vdata1		q2
#define vdata1_0	d4	/* Lower half of vdata1 */
#define vdata1_1	d5	/* Upper half of vdata1 */
#define vrepmask	q3
#define vrepmask0	d6
#define vrepmask1	d7
#define vend		q4
#define vend0		d8
#define vend1		d9

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit
 * per byte.  Each bit is set if the relevant byte matched the requested
 * character and cleared otherwise.  Since the bits in the syndrome reflect
 * exactly the order in which things occur in the original string, counting
 * trailing zeros allows us to identify exactly which byte has matched.
 */
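/*
 * For illustration only (not part of the build): a rough scalar model of
 * the per-chunk syndrome, written as C, could look like the following.
 * The helper name syndrome32 is hypothetical.
 *
 *   static uint32_t
 *   syndrome32 (const unsigned char *p, unsigned char c)
 *   {
 *     uint32_t synd = 0;
 *     for (int i = 0; i < 32; i++)
 *       if (p[i] == c)
 *         synd |= (uint32_t) 1 << i;   // bit i <=> byte i of the chunk
 *     return synd;
 *   }
 *
 * A nonzero syndrome yields the match offset as its count of trailing
 * zeros, which the code below obtains with rbit followed by clz.
 */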
	.thumb_func
	.p2align 4,,15

ENTRY(memchr)
	/* Use a simple loop if there are less than 8 bytes to search.  */
	cmp	cntin, #7
	bhi	.Llargestr
	and	chrin, chrin, #0xff

.Lsmallstr:
	subs	cntin, cntin, #1
	blo	.Lnotfound	/* Return not found if we reached the end.  */
	ldrb	tmp, [srcin], #1
	cmp	tmp, chrin
	bne	.Lsmallstr	/* Loop again if not found.  */
	/* Otherwise fix up the address and return.  */
	sub	result, srcin, #1
	bx	lr


.Llargestr:
	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes.  */
	/*
	 * Magic constant 0x8040201008040201 allows us to identify which
	 * lane matches the requested byte.
	 */
	movw	tmp, #0x0201
	movt	tmp, #0x0804
	lsl	soff, tmp, #4
	vmov	vrepmask0, tmp, soff
	vmov	vrepmask1, tmp, soff
	/* Work with aligned 32-byte chunks.  */
	bic	src, srcin, #31
	ands	soff, srcin, #31
	beq	.Lloopintro	/* Go straight to main loop if it's aligned.  */

	/*
	 * Input string is not 32-byte aligned.  We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask off the irrelevant part.
	 */
	vld1.8	{vdata0, vdata1}, [src:256]!
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	vceq.i8	vdata0, vdata0, vrepchr
	vceq.i8	vdata1, vdata1, vrepchr
	vand	vdata0, vdata0, vrepmask
	vand	vdata1, vdata1, vrepmask
	vpadd.i8 vdata0_0, vdata0_0, vdata0_1
	vpadd.i8 vdata1_0, vdata1_0, vdata1_1
	vpadd.i8 vdata0_0, vdata0_0, vdata1_0
	vpadd.i8 vdata0_0, vdata0_0, vdata0_0
	vmov	synd, vdata0_0[0]

	/* Clear the soff lower bits.  */
	lsr	synd, synd, soff
	lsl	synd, synd, soff
	/* The first block can also be the last.  */
	bls	.Lmasklast
	/* Have we found something already?  */
	cbnz	synd, .Ltail


.Lloopintro:
	vpush	{vend}
	/* 264/265 correspond to d8/d9 for q4.  */
	cfi_adjust_cfa_offset (16)
	cfi_rel_offset (264, 0)
	cfi_rel_offset (265, 8)
	.p2align 3,,7
.Lloop:
	vld1.8	{vdata0, vdata1}, [src:256]!
	subs	cntin, cntin, #32
	vceq.i8	vdata0, vdata0, vrepchr
	vceq.i8	vdata1, vdata1, vrepchr
	/* If we're out of data we finish regardless of the result.  */
	bls	.Lend
	/* Use a fast check for the termination condition.  */
	vorr	vend, vdata0, vdata1
	vorr	vend0, vend0, vend1
	vmov	synd, tmp, vend0
	orrs	synd, synd, tmp
	/* We're not out of data, loop if we haven't found the character.  */
	beq	.Lloop

.Lend:
	vpop	{vend}
	cfi_adjust_cfa_offset (-16)
	cfi_restore (264)
	cfi_restore (265)

	/* Termination condition found, let's calculate the syndrome value.  */
	vand	vdata0, vdata0, vrepmask
	vand	vdata1, vdata1, vrepmask
	vpadd.i8 vdata0_0, vdata0_0, vdata0_1
	vpadd.i8 vdata1_0, vdata1_0, vdata1_1
	vpadd.i8 vdata0_0, vdata0_0, vdata1_0
	vpadd.i8 vdata0_0, vdata0_0, vdata0_0
	vmov	synd, vdata0_0[0]
	cbz	synd, .Lnotfound
	bhi	.Ltail	/* Uses the condition code from
			   subs cntin, cntin, #32 above.  */


.Lmasklast:
	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches.  */
	neg	cntin, cntin
	lsl	synd, synd, cntin
	lsrs	synd, synd, cntin
	it	eq
	moveq	src, #0	/* If no match, set src to 0 so the retval is 0.  */


.Ltail:
	/* Count the trailing zeros using bit reversal.  */
	rbit	synd, synd
	/* Compensate for the last post-increment.  */
	sub	src, src, #32
	/* Count the leading zeros.  */
	clz	synd, synd
	/* Compute the potential result and return.  */
	add	result, src, synd
	bx	lr


.Lnotfound:
	/* Set result to NULL if not found and return.  */
	mov	result, #0
	bx	lr

END(memchr)
libc_hidden_builtin_def (memchr)
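/*
 * Reference note (illustration only, not assembled): the two boundary
 * maskings above are plain shifts on the 32-bit syndrome.  In C,
 * discarding the soff bytes that precede the start of the buffer would be
 *
 *   synd = (synd >> soff) << soff;
 *
 * and discarding the k bytes that lie past the end of the buffer (k is
 * always in the range 0..31 here, so the shifts are well defined) would be
 *
 *   synd = (synd << k) >> k;
 */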