1 /* strcspn with SSE4.2 intrinsics
2 Copyright (C) 2009-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <isa-level.h>
20 #if IS_IN (libc) || MINIMUM_X86_ISA_LEVEL >= 2
21
22 # include <nmmintrin.h>
23 # include <string.h>
24 # include "varshift.h"
25
26 /* We use 0x2:
27 _SIDD_SBYTE_OPS
28 | _SIDD_CMP_EQUAL_ANY
29 | _SIDD_POSITIVE_POLARITY
30 | _SIDD_LEAST_SIGNIFICANT
31 on pcmpistri to compare xmm/mem128
32
33 0 1 2 3 4 5 6 7 8 9 A B C D E F
34 X X X X X X X X X X X X X X X X
35
36 against xmm
37
38 0 1 2 3 4 5 6 7 8 9 A B C D E F
39 A A A A A A A A A A A A A A A A
40
41 to find out if the first 16byte data element has any byte A and
42 the offset of the first byte. There are 3 cases:
43
44 1. The first 16byte data element has the byte A at the offset X.
45 2. The first 16byte data element has EOS and doesn't have the byte A.
46 3. The first 16byte data element is valid and doesn't have the byte A.
47
48 Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
49
50 1 X 1 0/1 0
51 2 16 0 1 0
52 3 16 0 0 0
53
54 We exit from the loop for cases 1 and 2 with jbe which branches
55 when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
56 X for case 1. */
57
/* Default name of the function defined in this file.  An earlier
   definition of STRCSPN takes precedence over this default.  */
58 # ifndef STRCSPN
59 # define STRCSPN __strcspn_sse42
60 # endif
/* Default name of the routine the function falls back to when the
   set of bytes to look for is longer than 16 bytes.  An earlier
   definition of STRCSPN_GENERIC takes precedence.  */
61 # ifndef STRCSPN_GENERIC
62 # define STRCSPN_GENERIC __strcspn_generic
63 # endif
64
/* RETURN (val1, val2) expands to a return statement that yields VAL1
   when USE_AS_STRPBRK is defined and VAL2 otherwise, so one function
   body can produce either a pointer or a length.  Only the selected
   argument appears in the expansion; the other one is never
   evaluated.  */
65 # ifdef USE_AS_STRPBRK
66 # define RETURN(val1, val2) return val1
67 # else
68 # define RETURN(val1, val2) return val2
69 # endif
70
/* Declaration of the fallback routine.  Its return type follows the
   build: char * when USE_AS_STRPBRK is defined, size_t otherwise.
   attribute_hidden is defined outside this file; presumably it gives
   the symbol hidden visibility -- confirm against its definition.  */
71 extern
72 # ifdef USE_AS_STRPBRK
73 char *
74 # else
75 size_t
76 # endif
77 STRCSPN_GENERIC (const char *, const char *) attribute_hidden;
78
79
/* SSE4.2 strcspn, or strpbrk when USE_AS_STRPBRK is defined.

   S is scanned 16 bytes at a time with pcmpistri (imm8 0x2, see the
   comment above that begins "We use 0x2") for the first byte that also
   occurs in the NUL-terminated set A.

   Built as strcspn, the result is the number of bytes of S that precede
   the first such byte, or the length of S when there is none.  Built as
   strpbrk, the result is a pointer to that byte, or NULL when there is
   none.

   An empty A is answered immediately.  A set longer than 16 bytes does
   not fit in the single 128-bit MASK and is handed to STRCSPN_GENERIC.  */
80 # ifdef USE_AS_STRPBRK
81 char *
82 # else
83 size_t
84 # endif
85 __attribute__ ((section (".text.sse4.2")))
86 STRCSPN (const char *s, const char *a)
87 {
/* Nothing can match an empty set: the strpbrk build returns NULL, the
   strcspn build the whole length of S.  */
88 if (*a == 0)
89 RETURN (NULL, strlen (s));
90
91 const char *aligned;
92 __m128i mask, maskz, zero;
93 unsigned int maskz_bits;
94 unsigned int offset = (unsigned int) ((size_t) a & 15);
95 zero = _mm_set1_epi8 (0);
/* A does not start on a 16-byte boundary.  Read the aligned 16-byte
   block that contains its first byte and check whether A's terminator
   lies inside that block.  */
96 if (offset != 0)
97 {
98 /* Load masks. */
99 aligned = (const char *) ((size_t) a & -16L);
100 __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
101 maskz = _mm_cmpeq_epi8 (mask0, zero);
102
103 /* Find where the NULL terminator is. */
/* The low OFFSET bits describe bytes in front of A; shift them out so
   that only zero bytes at or after A are counted.  */
104 maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
105 if (maskz_bits != 0)
106 {
/* The terminator is in this block, so the whole set is already in
   MASK0.  Move it down so that A's first byte sits at byte 0, then
   go straight to the scan that matches the alignment of S.  */
107 mask = __m128i_shift_right (mask0, offset);
108 offset = (unsigned int) ((size_t) s & 15);
109 if (offset)
110 goto start_unaligned;
111
112 aligned = s;
113 goto start_loop;
114 }
115 }
116
117 /* A is aligned, or its terminator was not found in the first aligned
   block; either way load 16 bytes starting at A itself. */
118 mask = _mm_loadu_si128 ((__m128i *) a);
119 /* Find where the NULL terminator is. */
120 maskz = _mm_cmpeq_epi8 (mask, zero);
121 maskz_bits = _mm_movemask_epi8 (maskz);
122 if (maskz_bits == 0)
123 {
124 /* There is no NULL terminator. Don't use SSE4.2 if the length
125 of A > 16. */
/* a[0] through a[15] are all non-zero here, so a[16] is still part of
   the string (possibly its terminator).  */
126 if (a[16] != 0)
127 return STRCSPN_GENERIC (s, a);
128 }
129
130 aligned = s;
131 offset = (unsigned int) ((size_t) s & 15);
132 if (offset != 0)
133 {
/* Reached by falling through, or by the goto above with MASK and
   OFFSET (the misalignment of S) already set.  */
134 start_unaligned:
135 /* Check partial string. */
136 aligned = (const char *) ((size_t) s & -16L);
137 __m128i value = _mm_load_si128 ((__m128i *) aligned);
138
/* Drop the OFFSET bytes in front of S so that S's first byte is at
   byte 0.  NOTE(review): __m128i_shift_right comes from varshift.h;
   presumably it zero-fills the vacated high bytes, which is what the
   ZFlag remark below relies on -- confirm against its definition.  */
139 value = __m128i_shift_right (value, offset);
140
/* LENGTH is the index reported by pcmpistri for the first byte of
   VALUE that occurs in MASK; it is only used when CFlag says there
   was such a byte.  */
141 unsigned int length = _mm_cmpistri (mask, value, 0x2);
142 /* No need to check ZFlag since ZFlag is always 1. */
143 unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
144 if (cflag)
145 RETURN ((char *) (s + length), length);
146 /* Find where the NULL terminator is. */
/* 0x3a is _SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_EACH
   | _SIDD_MASKED_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT.
   Comparing VALUE with itself under it gives the index of the first
   zero byte.  Only the low 16 - OFFSET bytes of VALUE came from S, so
   a smaller index is the real terminator of S; otherwise the zero
   byte was introduced by the shift and the scan moves on.  */
147 unsigned int index = _mm_cmpistri (value, value, 0x3a);
148 if (index < 16 - offset)
149 RETURN (NULL, index);
150 aligned += 16;
151 }
152
/* Main scan: ALIGNED is 16-byte aligned from here on, and each pass
   examines one aligned block of S.  */
153 start_loop:
154 while (1)
155 {
156 __m128i value = _mm_load_si128 ((__m128i *) aligned);
157 unsigned int index = _mm_cmpistri (mask, value, 0x2);
158 unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
159 unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
/* CFlag set: a byte of the set was found at INDEX in this block.  */
160 if (cflag)
161 RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
/* ZFlag set without CFlag: the block holds the terminator of S and
   no byte of the set before it.  */
162 if (zflag)
163 RETURN (NULL,
164 /* Find where the NULL terminator is. */
165 (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
166 aligned += 16;
167 }
168 }
169 #endif
170