/* wcscpy with SSSE3
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 4 because there are no V3/V4
   implementations, so this file is also needed for ISA V3/V4
   builds.  */
#if ISA_SHOULD_BUILD (4)

# ifndef WCSCPY
#  define WCSCPY	__wcscpy_ssse3
# endif

# include <sysdep.h>

	.section .text.ssse3,"ax",@progbits
ENTRY (WCSCPY)

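/* The SysV ABI passes the destination in %rdi and the source in %rsi.
   The copy is done through %rdx (destination) and %rcx (source) so that
   %rdi can be returned unchanged from every exit path.  */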
	mov	%rsi, %rcx
	mov	%rdi, %rdx

	cmpl	$0, (%rcx)
	jz	L(Exit4)
	cmpl	$0, 4(%rcx)
	jz	L(Exit8)
	cmpl	$0, 8(%rcx)
	jz	L(Exit12)
	cmpl	$0, 12(%rcx)
	jz	L(Exit16)

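/* No terminating null wide character in the first four wchar_t's.
   Round %rsi up to the next 16-byte boundary past the source and test
   that aligned block for a null wchar_t while the first 16 bytes are
   copied with two 8-byte moves.  */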
	lea	16(%rcx), %rsi
	and	$-16, %rsi

	pxor	%xmm0, %xmm0
	mov	(%rcx), %r9
	mov	%r9, (%rdx)

	pcmpeqd	(%rsi), %xmm0
	mov	8(%rcx), %r9
	mov	%r9, 8(%rdx)

	pmovmskb %xmm0, %rax
	sub	%rcx, %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

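/* Align the destination up to a 16-byte boundary and advance the source
   by the same amount; the low four bits of the adjusted source then give
   its misalignment relative to the destination, which selects either the
   aligned loop or one of the palignr-based L(Shl4)/L(Shl8)/L(Shl12)
   loops.  */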
	mov	%rdx, %rax
	addq	$16, %rdx
	and	$-16, %rdx
	sub	%rdx, %rax
	sub	%rax, %rcx
	mov	%rcx, %rax
	and	$0xf, %rax
	mov	$0, %rsi

/* case: rcx_offset == rdx_offset */

	jz	L(Align16Both)

	cmp	$4, %rax
	je	L(Shl4)
	cmp	$8, %rax
	je	L(Shl8)
	jmp	L(Shl12)

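/* Source and destination are both 16-byte aligned.  Copy 16-byte blocks,
   checking each next source block for a null wchar_t, then round the
   source down to a 64-byte boundary and fall into the unrolled 64-byte
   loop.  */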
L(Align16Both):
	movaps	(%rcx), %xmm1
	movaps	16(%rcx), %xmm2
	movaps	%xmm1, (%rdx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm3
	movaps	%xmm2, (%rdx, %rsi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm4
	movaps	%xmm3, (%rdx, %rsi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm1
	movaps	%xmm4, (%rdx, %rsi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm2
	movaps	%xmm1, (%rdx, %rsi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm3
	movaps	%xmm2, (%rdx, %rsi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm3, (%rdx, %rsi)
	mov	%rcx, %rax
	lea	16(%rcx, %rsi), %rcx
	and	$-0x40, %rcx
	sub	%rcx, %rax
	sub	%rax, %rdx

	mov	$-0x40, %rsi

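/* Main loop: the source is now 64-byte aligned and the destination keeps
   its 16-byte alignment.  pminub folds the four 16-byte source blocks
   into one vector, so a single pcmpeqd/pmovmskb can tell whether any of
   them might contain a null wchar_t; on a hit, fall out to recheck each
   block individually before storing it.  */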
	.p2align 4
L(Aligned64Loop):
	movaps	(%rcx), %xmm2
	movaps	%xmm2, %xmm4
	movaps	16(%rcx), %xmm5
	movaps	32(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	48(%rcx), %xmm7
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	pcmpeqd	%xmm0, %xmm3
	pmovmskb %xmm3, %eax
	addq	$64, %rdx
	addq	$64, %rcx
	testl	%eax, %eax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%rdx)
	movaps	%xmm5, -48(%rdx)
	movaps	%xmm6, -32(%rdx)
	movaps	%xmm7, -16(%rdx)
	jmp	L(Aligned64Loop)

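/* A null wchar_t may be in one of the four blocks held in %xmm4..%xmm7.
   Recheck them one at a time, storing each block that is known to be
   null-free; if the terminator is found, L(CopyFrom1To16Bytes) finishes
   the block that contains it, otherwise resume the loop.  */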
L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0

	pmovmskb %xmm0, %eax
	movaps	%xmm4, -64(%rdx)
	addq	$16, %rsi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0

	pmovmskb %xmm0, %eax
	movaps	%xmm5, -48(%rdx)
	addq	$16, %rsi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%rdx)
	pcmpeqd	%xmm7, %xmm0

	pmovmskb %xmm0, %eax
	addq	$16, %rsi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	mov	$-0x40, %rsi
	movaps	%xmm7, -16(%rdx)
	jmp	L(Aligned64Loop)

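/* The source is 4 bytes past a 16-byte boundary while the destination is
   16-byte aligned.  Each stored block is formed with palignr from two
   consecutive aligned source loads; every source block is checked for a
   null wchar_t before it is merged.  */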
	.p2align 4
L(Shl4):
	movaps	-4(%rcx), %xmm1
	movaps	12(%rcx), %xmm2
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	addq	$28, %rcx
	addq	$16, %rdx

	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	addq	$-12, %rcx
	sub	%rax, %rdx

	movaps	-4(%rcx), %xmm1

	.p2align 4
L(Shl4LoopStart):
	movaps	12(%rcx), %xmm2
	movaps	28(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$4, %xmm4, %xmm5
	palignr	$4, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl4Start)

	palignr	$4, %xmm2, %xmm3
	addq	$64, %rcx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	addq	$64, %rdx
	jmp	L(Shl4LoopStart)

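/* The terminating null wchar_t is in the next aligned source block.
   Copy the remaining unmerged bytes in front of it with unaligned moves,
   set %rsi to step past them, and finish in L(CopyFrom1To16Bytes).  */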
L(Shl4LoopExit):
	movdqu	-4(%rcx), %xmm1
	mov	$12, %rsi
	movdqu	%xmm1, -4(%rdx)
	jmp	L(CopyFrom1To16Bytes)

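/* As L(Shl4), but the source is 8 bytes past a 16-byte boundary.  */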
	.p2align 4
L(Shl8):
	movaps	-8(%rcx), %xmm1
	movaps	8(%rcx), %xmm2
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	addq	$24, %rcx
	addq	$16, %rdx

	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	addq	$-8, %rcx
	sub	%rax, %rdx

	movaps	-8(%rcx), %xmm1

	.p2align 4
L(Shl8LoopStart):
	movaps	8(%rcx), %xmm2
	movaps	24(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	addq	$64, %rcx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	addq	$64, %rdx
	jmp	L(Shl8LoopStart)

L(Shl8LoopExit):
	mov	(%rcx), %r9
	mov	$8, %rsi
	mov	%r9, (%rdx)
	jmp	L(CopyFrom1To16Bytes)

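/* As L(Shl4), but the source is 12 bytes past a 16-byte boundary.  */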
	.p2align 4
L(Shl12):
	movaps	-12(%rcx), %xmm1
	movaps	4(%rcx), %xmm2
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	addq	$20, %rcx
	addq	$16, %rdx

	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	addq	$-4, %rcx
	sub	%rax, %rdx

	movaps	-12(%rcx), %xmm1

	.p2align 4
L(Shl12LoopStart):
	movaps	4(%rcx), %xmm2
	movaps	20(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl12Start)
	palignr	$12, %xmm2, %xmm3
	addq	$64, %rcx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	addq	$64, %rdx
	jmp	L(Shl12LoopStart)

L(Shl12LoopExit):
	mov	(%rcx), %r9d
	mov	$4, %rsi
	mov	%r9d, (%rdx)
	jmp	L(CopyFrom1To16Bytes)

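/* Tail copy.  After adjusting by %rsi, %rcx points at the 16-byte block
   holding the terminating null wchar_t and %eax is the pcmpeqd byte mask
   for that block; copy 4, 8, 12 or 16 bytes up to and including the null,
   then return the original destination from %rdi.  */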
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%rsi, %rdx
	add	%rsi, %rcx

	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)

	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit12)

	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
	mov	%rdi, %rax
	ret

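/* L(Exit4) through L(Exit16) copy the final 4, 8, 12 or 16 bytes,
   including the terminating null wide character, and return the original
   destination pointer from %rdi.  */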
	.p2align 4
L(Exit4):
	movl	(%rcx), %eax
	movl	%eax, (%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(Exit8):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(Exit12):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %eax
	mov	%eax, 8(%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(Exit16):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
	mov	%rdi, %rax
	ret

END(WCSCPY)
#endif