1/* Vector optimized 32/64 bit S/390 version of wcsrchr.
2   Copyright (C) 2015-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <ifunc-wcsrchr.h>
20#if HAVE_WCSRCHR_Z13
21
22# include "sysdep.h"
23# include "asm-syntax.h"
24
25	.text
26
/* wchar_t *wcsrchr (const wchar_t *s, wchar_t c)
   Locate the last character c in string.

   Outline:
   1. The first (up to) 16 bytes of s are loaded without crossing a 4k-byte
      boundary and searched for c or the terminating zero.
   2. The remainder of s is searched in 16-byte aligned parts.  Every part
      which contains c is saved in v19 and its offset from s in r1.
   3. Once the terminating zero is reached, v19 is cut off after the zero
      (or after the last loaded byte), reversed element-wise and searched
      for c, which yields the last occurrence of c.
   A pointer s which is not 4-byte aligned is handled by WCSRCHR_C.

   Register usage:
   -r0=loaded bytes in first part of s.
   -r1=offset from s of the part saved in v19 or -1 if c was not found yet.
   -r2=s
   -r3=c
   -r4=tmp
   -r5=current_len
   -v16=part of s
   -v17=index of found element
   -v18=replicated c
   -v19=part of s with last occurrence of c.
   -v20=permute pattern
*/
ENTRY(WCSRCHR_Z13)
	.machine "z13"
	.machinemode "zarch_nohighgprs"

	vlbb	%v16,0(%r2),6	/* Load s until next 4k-byte boundary.  */
	lcbb	%r0,0(%r2),6	/* Get bytes to 4k-byte boundary or 16.  */

	tmll	%r2,3		/* Test if s is 4-byte aligned?   */
	jne	.Lfallback	/* And use common-code variant if not.  */

	vlvgf	%v18,%r3,0	/* Generate vector which elements are all c.  */
	vrepf	%v18,%v18,0

	lghi	%r1,-1		/* Currently no c found.  */
	lghi	%r5,0		/* current_len = 0.  */

	vfeezfs	%v17,%v16,%v18	/* Find element equal or zero.  */
	vlgvb	%r4,%v17,7	/* Load byte index of c/zero or 16.  */
	clrjl	%r4,%r0,.Lfound_first_part /* Found c/zero in loaded bytes.  */
.Lalign:
	/* Align s to 16 byte.  */
	risbgn	%r4,%r2,60,128+63,0 /* %r4 = bits 60-63 of %r2 'and' 15.  */
	lghi	%r5,16		/* current_len = 16.  */
	slr	%r5,%r4		/* Compute bytes to 16bytes boundary.  */

.Lloop:
	/* Four times unrolled search of 16-byte aligned parts of s.  */
	vl	%v16,0(%r5,%r2) /* Load s.  */
	vfeezfs	%v17,%v16,%v18	/* Find element equal with zero search.  */
	jno	.Lfound		/* Found c/zero (cc=0|1|2).  */
	vl	%v16,16(%r5,%r2)
	vfeezfs	%v17,%v16,%v18
	jno	.Lfound16
	vl	%v16,32(%r5,%r2)
	vfeezfs	%v17,%v16,%v18
	jno	.Lfound32
	vl	%v16,48(%r5,%r2)
	vfeezfs	%v17,%v16,%v18
	jno	.Lfound48

	aghi	%r5,64
	j	.Lloop		/* No character and no zero -> loop.  */

	/* Adjust current_len to the part in which c/zero was found.  The
	   labels fall through, thus .Lfound48 adds 48 in total.  */
.Lfound48:
	la	%r5,16(%r5)	/* Use la since aghi would clobber cc.  */
.Lfound32:
	la	%r5,16(%r5)
.Lfound16:
	la	%r5,16(%r5)
.Lfound:
	je	.Lzero		/* Found zero, but no c before that zero.  */
	/* Save this part of s to check for further matches after reaching
	   the end of the complete string.  */
	vlr	%v19,%v16
	lgr	%r1,%r5		/* Remember the offset of the saved part.  */

	jh	.Lzero		/* Found a zero after the found c.  */
	aghi	%r5,16		/* Start search of next part of s.  */
	j	.Lloop

.Lfound_first_part:
	/* This code is only executed if the found c/zero is within loaded
	   bytes. If no c/zero was found (cc==3) the found index = 16, thus
	   this code is not called.
	   Resulting condition code of vector find element equal:
	   cc==0: no c, found zero
	   cc==1: c found, no zero
	   cc==2: c found, found zero after c
	   cc==3: no c, no zero (this case can be ignored).  */
	je	.Lzero		/* Found zero, but no c before that zero.  */

	locgrne	%r1,%r5		/* Mark c as found in first part of s
				   (r5 is still 0 here).  */
	vlr	%v19,%v16

	jl	.Lalign		/* No zero (e.g. if vr was fully loaded)
				   -> Align and loop afterwards.  */

	/* Found a zero in vr. If vr was not fully loaded due to block
	   boundary, the remaining bytes are filled with zero and we can't
	   rely on zero indication of condition code here!  */

	vfenezf	%v17,%v16,%v16
	vlgvb	%r4,%v17,7	/* Load byte index of zero or 16.  */
	clrjl	%r4,%r0,.Lzero	/* Zero within loaded bytes -> end.  */
	j	.Lalign		/* Align and loop afterwards.  */

.Lend_searched_zero:
	/* c is zero: return a pointer to the terminating zero itself.  */
	vlgvb	%r4,%v17,7	/* Load byte index of zero.  */
	algr	%r5,%r4
	la	%r2,0(%r5,%r2)	/* Return pointer to zero.  */
	br	%r14

.Lzero:
	/* Reached end of string. Check if one c was found before.  */
	clije	%r3,0,.Lend_searched_zero /* Found zero and c is zero.  */

	cgfi	%r1,-1		/* No c found -> return NULL.  */
	locghie	%r2,0
	ber	%r14

	larl	%r3,.Lpermute_mask /* Load permute mask.  */
	vl	%v20,0(%r3)

	/* c was found and is part of v19.  */
	vfenezf	%v17,%v19,%v19	/* Find zero.  */
	vlgvb	%r4,%v17,7	/* Load byte index of zero or 16.  */
	ahi	%r4,3		/* Found zero index is first byte,
				   thus highest byte index is last byte of
				   wchar_t zero.  */

	/* r1 is the offset of the part saved in v19, whereas r5 is the offset
	   of the part containing the terminating zero, which can be a later
	   part.  Thus r1 has to be tested here: if v19 is the first part
	   (r1 == 0), only the r0 bytes loaded from s may be searched.  */
	clgfi	%r1,0		/* Loaded byte count in v19 is 16, ...  */
	lochine	%r0,16		/* ... if v19 is not the first part of s.  */
	ahi	%r0,-1		/* Convert byte count to highest index.  */

	clr	%r0,%r4
	locrl	%r4,%r0		/* r4 = min (zero-index, highest-index).  */

	/* Right-shift of v19 to mask bytes after zero.  */
	clije	%r4,15,.Lzero_permute /* No shift is needed if highest index
					 in vr is 15.  */
	lhi	%r0,15
	slr	%r0,%r4		/* Compute byte count for vector shift right.  */
	sll	%r0,3		/* Convert to bit count.  */
	vlvgb	%v17,%r0,7
	vsrlb	%v19,%v19,%v17	/* Vector shift right by byte by number of bytes
				   specified in bits 1-4 of byte 7 in v17.   */

	/* Reverse the order of the wchar_t elements in v19. The bytes within
	   each element keep their order (see .Lpermute_mask).  */
.Lzero_permute:
	vperm	%v19,%v19,%v19,%v20 /* Permute v19 to reversed order.  */

	/* Find c in reversed v19.  */
	vfeef	%v19,%v19,%v18	/* Find c.  */
	la	%r2,0(%r1,%r2)	/* r2 = address of the part saved in v19.  */
	vlgvb	%r3,%v19,7	/* Load byte index of c.  */

	/* Compute index in real s and return.  */
	slgr	%r4,%r3
	lay	%r2,-3(%r4,%r2)	/* Return pointer to last c. -3 is needed,
				   because the found byte index is reversed in
				   vector-register. Thus point to first byte of
				   wchar_t.  */
	br	%r14
.Lpermute_mask:
	/* Byte selectors for vperm: element 3,2,1,0 of the source vector.  */
	.byte	0x0C,0x0D,0x0E,0x0F,0x08,0x09,0x0A,0x0B
	.byte	0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03
.Lfallback:
	jg	WCSRCHR_C	/* s is not 4-byte aligned.  */
END(WCSRCHR_Z13)
191
192# if ! HAVE_WCSRCHR_IFUNC
193strong_alias (WCSRCHR_Z13, wcsrchr)
194# endif
195#endif
196