1 /* strcspn with SSE4.2 intrinsics
2    Copyright (C) 2009-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #include <isa-level.h>
20 #if IS_IN (libc) || MINIMUM_X86_ISA_LEVEL >= 2
21 
22 # include <nmmintrin.h>
23 # include <string.h>
24 # include "varshift.h"
25 
26 /* We use 0x2:
27 	_SIDD_SBYTE_OPS
28 	| _SIDD_CMP_EQUAL_ANY
29 	| _SIDD_POSITIVE_POLARITY
30 	| _SIDD_LEAST_SIGNIFICANT
31    on pcmpistri to compare xmm/mem128
32 
33    0 1 2 3 4 5 6 7 8 9 A B C D E F
34    X X X X X X X X X X X X X X X X
35 
36    against xmm
37 
38    0 1 2 3 4 5 6 7 8 9 A B C D E F
39    A A A A A A A A A A A A A A A A
40 
41    to find out if the first 16byte data element has any byte A and
42    the offset of the first byte.  There are 3 cases:
43 
44    1. The first 16byte data element has the byte A at the offset X.
45    2. The first 16byte data element has EOS and doesn't have the byte A.
46    3. The first 16byte data element is valid and doesn't have the byte A.
47 
   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:

   case		ECX	CFlag	ZFlag	SFlag
    1		 X	  1	 0/1	  0
    2		16	  0	  1	  0
    3		16	  0	  0	  0
53 
54    We exit from the loop for cases 1 and 2 with jbe which branches
55    when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
56    X for case 1.  */
57 
/* Symbol name given to the SSE4.2 implementation defined in this file.
   It can be overridden by defining STRCSPN before this point.  */
# ifndef STRCSPN
#  define STRCSPN __strcspn_sse42
# endif
/* Symbol name of the fallback implementation that is called when the
   accept set is longer than 16 bytes.  It can be overridden by defining
   STRCSPN_GENERIC before this point.  */
# ifndef STRCSPN_GENERIC
#  define STRCSPN_GENERIC __strcspn_generic
# endif

/* RETURN (val1, val2) returns from the enclosing function with exactly
   one of its two arguments: VAL1 (a pointer result) when
   USE_AS_STRPBRK is defined, VAL2 (a length result) otherwise.  The
   argument that is not selected is discarded by the preprocessor and is
   never evaluated.  */
# ifdef USE_AS_STRPBRK
#  define RETURN(val1, val2) return val1
# else
#  define RETURN(val1, val2) return val2
# endif
70 
71 extern
72 # ifdef USE_AS_STRPBRK
73 char *
74 # else
75 size_t
76 # endif
77 STRCSPN_GENERIC (const char *, const char *) attribute_hidden;
78 
79 
80 # ifdef USE_AS_STRPBRK
81 char *
82 # else
83 size_t
84 # endif
85 __attribute__ ((section (".text.sse4.2")))
STRCSPN(const char * s,const char * a)86 STRCSPN (const char *s, const char *a)
87 {
88   if (*a == 0)
89     RETURN (NULL, strlen (s));
90 
91   const char *aligned;
92   __m128i mask, maskz, zero;
93   unsigned int maskz_bits;
94   unsigned int offset = (unsigned int) ((size_t) a & 15);
95   zero = _mm_set1_epi8 (0);
96   if (offset != 0)
97     {
98       /* Load masks.  */
99       aligned = (const char *) ((size_t) a & -16L);
100       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
101       maskz = _mm_cmpeq_epi8 (mask0, zero);
102 
103       /* Find where the NULL terminator is.  */
104       maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
105       if (maskz_bits != 0)
106         {
107           mask = __m128i_shift_right (mask0, offset);
108           offset = (unsigned int) ((size_t) s & 15);
109           if (offset)
110             goto start_unaligned;
111 
112           aligned = s;
113           goto start_loop;
114         }
115     }
116 
117   /* A is aligned.  */
118   mask = _mm_loadu_si128 ((__m128i *) a);
119   /* Find where the NULL terminator is.  */
120   maskz = _mm_cmpeq_epi8 (mask, zero);
121   maskz_bits = _mm_movemask_epi8 (maskz);
122   if (maskz_bits == 0)
123     {
124       /* There is no NULL terminator.  Don't use SSE4.2 if the length
125          of A > 16.  */
126       if (a[16] != 0)
127         return STRCSPN_GENERIC (s, a);
128     }
129 
130   aligned = s;
131   offset = (unsigned int) ((size_t) s & 15);
132   if (offset != 0)
133     {
134     start_unaligned:
135       /* Check partial string.  */
136       aligned = (const char *) ((size_t) s & -16L);
137       __m128i value = _mm_load_si128 ((__m128i *) aligned);
138 
139       value = __m128i_shift_right (value, offset);
140 
141       unsigned int length = _mm_cmpistri (mask, value, 0x2);
142       /* No need to check ZFlag since ZFlag is always 1.  */
143       unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
144       if (cflag)
145 	RETURN ((char *) (s + length), length);
146       /* Find where the NULL terminator is.  */
147       unsigned int index = _mm_cmpistri (value, value, 0x3a);
148       if (index < 16 - offset)
149 	RETURN (NULL, index);
150       aligned += 16;
151     }
152 
153 start_loop:
154   while (1)
155     {
156       __m128i value = _mm_load_si128 ((__m128i *) aligned);
157       unsigned int index = _mm_cmpistri (mask, value, 0x2);
158       unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
159       unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
160       if (cflag)
161 	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
162       if (zflag)
163 	RETURN (NULL,
164 		/* Find where the NULL terminator is.  */
165 		(size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
166       aligned += 16;
167     }
168 }
169 #endif
170