/* Optimized strspn implementation for Power8.

   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* size_t [r3] strspn (const char *string [r3],
                       const char *needleAccept [r4])  */

/* This takes a novel approach by computing a 256 bit mask whereby
   each set bit implies the byte is "accepted".  P8 has extremely
   efficient vector hardware for selecting bits from a mask.

   One might ask "why not use bpermd for short strings"?  It is
   so slow that its performance roughly matches the generic PPC64
   variant without any fancy masking, with the added expense of
   making the mask.  That was the first variant of this.  */
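
/* For reference, a rough C sketch of the same idea (a hypothetical
   helper, not part of the build): build a 256-bit bitmap with one bit
   per possible byte value, then count the leading bytes of the string
   whose bit is set.  strcspn is the same with the test inverted.

     #include <stddef.h>

     static size_t
     simple_strspn (const char *s, const char *accept)
     {
       unsigned long long map[4] = { 0, 0, 0, 0 };  // 1 == accepted
       for (; *accept != '\0'; accept++)
         {
           unsigned char c = *accept;
           map[c / 64] |= 1ULL << (c % 64);
         }
       size_t n = 0;
       for (; s[n] != '\0'; n++)
         {
           unsigned char c = s[n];
           if ((map[c / 64] & (1ULL << (c % 64))) == 0)
             break;
         }
       return n;
     }

   The code below keeps the same map in r5-r8 (in MSB-first bit order)
   and tests 16 string bytes at a time with vbpermq instead of the
   scalar loop.  */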
31


#include <sysdep.h>

#ifndef USE_AS_STRCSPN
#  define USE_AS_STRCSPN 0
#  ifndef STRSPN
#    define STRSPN strspn
#  endif
#  define INITIAL_MASK 0
#  define UPDATE_MASK(RA, RS, RB) or	RA, RS, RB
#else
#  ifndef STRSPN
#    define STRSPN strcspn
#  endif
#  define INITIAL_MASK -1
#  define UPDATE_MASK(RA, RS, RB) andc	RA, RS, RB
#endif
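
/* With these definitions the same code below serves both routines:
   strspn starts from an all-zero map and ORs in a bit for each accept
   character, while strcspn starts from an all-ones map and clears
   (andc) a bit for each reject character.  Either way, a set bit later
   means "this byte does not end the scan".  */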

/* Simple macro to use VSX instructions in overlapping VR's.  */
#define XXVR(insn, vrt, vra, vrb) \
	insn 32+vrt, 32+vra, 32+vrb
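
/* The 32 VRs overlap VSRs 32-63, hence the "32+" offsets when a
   VSX-form instruction operates on vector registers.  */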

	.machine  power8
ENTRY_TOCLESS (STRSPN, 4)
	CALL_MCOUNT 2

	/* Generate useful constants for later on.  */
	vspltisb v1, 7
	vspltisb v2, -1
	vslb	v1, v1, v1	/* 0x80 to swap high bit for vbpermq.  */
	vspltisb v10, 0
	vsldoi	v4, v10, v2, 2	/* 0xFFFF into vr4.  */
	XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches.  */
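	/* At this point v1 is 0x80 in every byte, v2 is all ones, and v4
	   holds 0xFFFF in the halfword where vbpermq deposits its 16
	   result bits, so one halfword compare against v4 can check
	   "all 16 bytes matched".  */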

	/* Prepare to compute 256b mask.  */
	addi	r4, r4, -1
	li	r5, INITIAL_MASK
	li	r6, INITIAL_MASK
	li	r7, INITIAL_MASK
	li	r8, INITIAL_MASK

#if USE_AS_STRCSPN
	/* Ensure the null character never matches by clearing ISA bit 0
	   in r5, which is the bit vbpermq will later use to check for
	   it.  */
	srdi	r5, r5, 1
#endif

	li	r11, 1
	sldi	r11, r11, 63
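	/* r11 = 0x8000000000000000.  Shifting it right by a character
	   value below 64 puts a 1 at ISA bit <char> (MSB-first), the same
	   bit numbering vbpermq uses to index the map; larger values are
	   handled by the xor trick below.  */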

	/* Start interleaved Mask computation.
	   This will eventually or 1's into ignored bits from vbpermq.  */
	lvsr	v11, 0, r3
	vspltb  v11, v11, 0	/* Splat shift constant.  */

	/* Build a 256b mask in r5-r8.  */
	.align 4
L(next_needle):
	lbzu	r9, 1(r4)

	cmpldi	cr0, r9, 0
	cmpldi	cr1, r9, 128

	/* This is a little tricky.  srd uses only the low 7 bits of the
	   shift amount, and whenever the bit with value 64 is set there
	   the result is always 0.  So, we can effectively shift 128b in
	   this case.  */
	xori	r12, r9,  0x40	/* Invert bit 6.  */
	srd	r10, r11, r9	/* Mask for bits 0-63.  */
	srd	r12, r11, r12	/* Mask for bits 64-127.  */
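	/* For example, needle byte 0x0a sets ISA bit 10 in r10 and leaves
	   r12 zero (its xor'ed shift amount has the 64 bit set), while
	   needle byte 0x61 leaves r10 zero and sets ISA bit 33 in r12,
	   which belongs to characters 64-127.  */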

	beq	cr0, L(start_cmp)

	/* Now, or the value into the correct GPR.  */
	bge	cr1, L(needle_gt128)
	UPDATE_MASK (r5, r5, r10)	/* 0 - 63.  */
	UPDATE_MASK (r6, r6, r12)	/* 64 - 127.  */
	b L(next_needle)

	.align 4
L(needle_gt128):
	UPDATE_MASK (r7, r7, r10)	/* 128 - 191.  */
	UPDATE_MASK (r8, r8, r12)	/* 192 - 255.  */
	b L(next_needle)


	.align 4
L(start_cmp):
	/* Move and merge bitmap into 2 VRs.  bpermd is slower on P8.  */
	mr	r0, r3		/* Save r3 for final length computation.  */
	mtvrd	v5, r5
	mtvrd	v6, r6
	mtvrd	v7, r7
	mtvrd	v8, r8

	/* Continue interleaved mask generation.  */
#ifdef __LITTLE_ENDIAN__
	vsrw	v11, v2, v11	/* Note, shift ignores higher order bits.  */
	vsplth  v11, v11, 0	/* Only care about the high 16 bits of v11.  */
#else
	vslw	v11, v2, v11	/* Note, shift ignores higher order bits.  */
	vsplth  v11, v11, 1	/* Only care about the low 16 bits of v11.  */
#endif
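
	/* Each halfword of v11 now has a 1 for every byte that precedes
	   the start of the string within its aligned quadword (and is 0
	   if the string is 16B aligned).  Once trimmed by v4 below and
	   ORed into the vbpermq result, those bytes are forced to look
	   like matches so they cannot end the scan early.  */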
	lvx	v0, 0, r3	/* Note, unaligned load ignores lower bits.  */

	/* Do the merging of the bitmask.  */
	XXVR(xxmrghd, v5, v5, v6)
	XXVR(xxmrghd, v6, v7, v8)
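	/* v5 now maps characters 0-127 (r5 in doubleword 0, r6 in
	   doubleword 1) and v6 maps characters 128-255, matching the
	   MSB-first indexing vbpermq applies to its 128-bit source.  */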

	/* Finish mask generation.  */
	vand	v11, v11, v4	/* Throw away bits not in the mask.  */

	/* Compare the first 1-16B, while masking unwanted bytes.  */
	clrrdi  r3, r3, 4	/* Note, counts from qw boundaries.  */
	vxor	v9, v0, v1	/* Swap high bit.  */
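	/* vbpermq returns 0 for index bytes >= 128, so v0 can only select
	   from the 0-127 map while v9 (high bit flipped) can only select
	   from the 128-255 map; each byte picks up its bit from exactly
	   one of the two lookups and the vor merges them.  */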
	vbpermq	v8, v5, v0
	vbpermq	v7, v6, v9
	vor	v7, v7, v8
	vor	v7, v7, v11	/* Ignore non-participating bytes.  */
	vcmpequh. v8, v7, v4
	bnl	cr6, L(done)

	addi	r3, r3, 16

	.align 4
L(vec):
	lvx	v0, 0, r3
	addi	r3, r3, 16
	vxor	v9, v0, v1	/* Swap high bit.  */
	vbpermq	v8, v5, v0
	vbpermq	v7, v6, v9
	vor	v7, v7, v8
	vcmpequh. v8, v7, v4
	blt	cr6, L(vec)

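	/* The loop falls through when at least one of the 16 bytes just
	   checked either is not in the map or is the terminating null
	   (whose bit is never set), so back up to that quadword and find
	   the exact byte below.  */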
	addi	r3, r3, -16
L(done):
	subf	r3, r0, r3
	mfvrd	r10, v7

#ifdef __LITTLE_ENDIAN__
	addi	r0,  r10, 1	/* Count the trailing 1's.  */
	andc	r10, r10, r0
	popcntd	r10, r10
#else
	xori	r10, r10, 0xffff /* Count leading 1's by inverting.  */
	addi	r3,  r3,  -48	/* Account for the extra leading zeros.  */
	cntlzd  r10, r10
#endif
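	/* Little-endian: r10 & ~(r10 + 1) isolates the run of trailing
	   1's (the matching bytes at the start of the block) and popcntd
	   counts them.  Big-endian: after inverting the 16 result bits,
	   cntlzd returns 48 (the zero bits above the field) plus the
	   number of matching bytes, hence the -48 adjustment above.  */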

	add	r3, r3, r10
	blr

END(STRSPN)
libc_hidden_builtin_def (STRSPN)
