/* wcscpy with SSSE3
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)
# include <sysdep.h>

/* Unwind-info bookkeeping that accompanies a 4-byte push or pop of REG.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Offsets of the stack arguments from %esp at function entry (before
   any PUSH).  RETURN pops the saved %edi and returns; the trailing
   CFI_PUSH re-establishes the "%edi is pushed" unwind state for the
   code that textually follows the ret.  LEN is not referenced in this
   file.  */
# define PARMS	4
# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
# define STR1	PARMS
# define STR2	STR1+4
# define LEN	STR2+4

/* __wcscpy_ssse3 (dest, src): copy the wide string at SRC, including
   its terminating zero element, to DEST and return DEST in %eax.
   Elements are handled as 4-byte units throughout.

   Arguments are read from the stack: dest at STR1(%esp), src at
   STR2(%esp).

   Register roles once past the short-string checks:
     %edx   destination cursor
     %ecx   source cursor
     %edi   original dest, kept for the return value (saved/restored)
     %esi   byte offset applied at L(CopyFrom1To16Bytes); also scratch
	    (saved/restored)
     %eax   pmovmskb mask of the 16-byte block being tested
     %xmm0  all-zero comparand.  pcmpeqd writes its result into %xmm0,
	    so the register is still zero whenever the mask was zero and
	    execution continues.

   The aligned loads (movaps with 4/8/12-byte displacements) rely on
   both pointers being 4-byte aligned -- presumably guaranteed by the
   wchar_t type; verify against callers.  */
	atom_text_section
ENTRY (__wcscpy_ssse3)
	mov	STR1(%esp), %edx
	mov	STR2(%esp), %ecx

	/* Terminator within the first four elements: copy just that much
	   without touching any callee-saved register.  The operand size
	   is spelled out because no register operand determines it.  */
	cmpl	$0, (%ecx)
	jz	L(ExitTail4)
	cmpl	$0, 4(%ecx)
	jz	L(ExitTail8)
	cmpl	$0, 8(%ecx)
	jz	L(ExitTail12)
	cmpl	$0, 12(%ecx)
	jz	L(ExitTail16)

	PUSH	(%edi)
	mov	%edx, %edi
	PUSH	(%esi)
	lea	16(%ecx), %esi

	/* %esi = first 16-byte boundary strictly above src.  */
	and	$-16, %esi

	/* Test that aligned block for a zero element while copying the
	   first 16 source bytes (already known to be terminator-free).  */
	pxor	%xmm0, %xmm0
	pcmpeqd	(%esi), %xmm0
	movdqu	(%ecx), %xmm1
	movdqu	%xmm1, (%edx)

	pmovmskb %xmm0, %eax
	/* %esi becomes the byte offset of the tested block from src.  */
	sub	%ecx, %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Advance dest to the next 16-byte boundary and move src forward
	   by the same number of bytes.  */
	mov	%edx, %eax
	lea	16(%edx), %edx
	and	$-16, %edx
	sub	%edx, %eax

	sub	%eax, %ecx
	mov	%ecx, %eax
	and	$0xf, %eax
	/* mov does not modify EFLAGS, so the jz below still tests the
	   result of the and above.  Do not turn this into xor.  */
	mov	$0, %esi

	/* Dispatch on the source misalignment relative to the now
	   aligned destination.  */
	jz	L(Align16Both)
	cmp	$4, %eax
	je	L(Shl4)
	cmp	$8, %eax
	je	L(Shl8)
	jmp	L(Shl12)

/* Source and destination are both 16-byte aligned.  Each step loads the
   next block, stores the previous (already tested) one, tests the new
   one and advances %esi by 16, so that on a hit %ecx + %esi and
   %edx + %esi address the block containing the terminator.  */
L(Align16Both):
	movaps	(%ecx), %xmm1
	movaps	16(%ecx), %xmm2
	movaps	%xmm1, (%edx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm4
	movaps	%xmm3, (%edx, %esi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm1
	movaps	%xmm4, (%edx, %esi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm2
	movaps	%xmm1, (%edx, %esi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Round the source cursor down to a 64-byte boundary and shift
	   the destination cursor by the same distance.  */
	movaps	%xmm3, (%edx, %esi)
	mov	%ecx, %eax
	lea	16(%ecx, %esi), %ecx
	and	$-0x40, %ecx
	sub	%ecx, %eax
	sub	%eax, %edx

	/* Inside the loop both cursors are advanced by 64 before the
	   test, so the first block of the group sits at offset -64.  */
	mov	$-0x40, %esi

/* Main aligned loop: 64 bytes per iteration.  The four blocks are
   combined with a byte-wise unsigned minimum and tested once.  A zero
   dword in that minimum does not prove that any single block holds a
   zero element, so L(Aligned64Leave) re-tests every block and resumes
   the loop when none does.  */
L(Aligned64Loop):
	movaps	(%ecx), %xmm2
	movaps	32(%ecx), %xmm3
	movaps	%xmm2, %xmm4
	movaps	16(%ecx), %xmm5
	movaps	%xmm3, %xmm6
	movaps	48(%ecx), %xmm7
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	lea	64(%edx), %edx
	pcmpeqd	%xmm0, %xmm3
	lea	64(%ecx), %ecx
	pmovmskb %xmm3, %eax

	test	%eax, %eax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%edx)
	movaps	%xmm5, -48(%edx)
	movaps	%xmm6, -32(%edx)
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

/* Test the four blocks of the group one at a time (%xmm4, %xmm5,
   %xmm6, %xmm7 in source order), storing each block only after it has
   been found free of terminators.  %esi tracks the offset of the block
   under test from the already advanced cursors.  */
L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm4, -64(%edx)
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm5, -48(%edx)
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%edx)
	pcmpeqd	%xmm7, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	/* False positive from the combined test: finish the group and
	   go back to the main loop.  */
	mov	$-0x40, %esi
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

/* Source is 4 bytes past a 16-byte boundary relative to the aligned
   destination.  Aligned source blocks are loaded from -4(%ecx),
   12(%ecx), ... and each pair of neighbouring blocks is merged with
   palignr $4 to form one aligned 16-byte store.  %xmm1 and %xmm3
   alternate as the holder of the previous block.  */
	.p2align 4
L(Shl4):
	movaps	-4(%ecx), %xmm1
	movaps	12(%ecx), %xmm2
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	28(%ecx), %ecx
	lea	16(%edx), %edx

	/* Round the block address down to a 64-byte boundary, restore
	   the 4-byte phase of %ecx and shift %edx by the same distance
	   before entering the 64-byte loop.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-12(%ecx), %ecx
	sub	%eax, %edx

	movaps	-4(%ecx), %xmm1

/* 64 bytes per iteration.  On a (possibly false) hit from the combined
   minimum test, restart at L(Shl4Start), which re-tests the same data
   one block at a time using %xmm1 and %xmm2 as loaded here.  */
L(Shl4LoopStart):
	movaps	12(%ecx), %xmm2
	movaps	28(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$4, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$4, %xmm3, %xmm4
	jnz	L(Shl4Start)

	palignr	$4, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl4LoopStart)

/* Terminator is in the block at 12(%ecx).  Copy the 12 bytes that
   precede that block, step both cursors onto it, then finish according
   to the mask in %eax.  %esi is used as scratch before being restored.  */
L(Shl4LoopExit):
	movlpd	(%ecx), %xmm0
	movl	8(%ecx), %esi
	movlpd	%xmm0, (%edx)
	movl	%esi, 8(%edx)
	POP	(%esi)
	add	$12, %edx
	add	$12, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	/* The code below runs with %esi still pushed.  */
	CFI_PUSH	(%esi)

/* Same scheme as L(Shl4), for a source phase of 8 bytes.  */
	.p2align 4
L(Shl8):
	movaps	-8(%ecx), %xmm1
	movaps	8(%ecx), %xmm2
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	24(%ecx), %ecx
	lea	16(%edx), %edx

	/* Rewind to a 64-byte source boundary, keeping the 8-byte
	   phase.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-8(%ecx), %ecx
	sub	%eax, %edx

	movaps	-8(%ecx), %xmm1

L(Shl8LoopStart):
	movaps	8(%ecx), %xmm2
	movaps	24(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$8, %xmm3, %xmm4
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl8LoopStart)

/* Terminator is in the block at 8(%ecx): copy the 8 bytes before it,
   step onto the block and finish according to the mask.  */
L(Shl8LoopExit):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	POP	(%esi)
	add	$8, %edx
	add	$8, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	/* The code below runs with %esi still pushed.  */
	CFI_PUSH	(%esi)

/* Same scheme as L(Shl4), for a source phase of 12 bytes.  This is also
   the fall-through target of the dispatch for any phase other than
   0, 4 or 8.  */
	.p2align 4
L(Shl12):
	movaps	-12(%ecx), %xmm1
	movaps	4(%ecx), %xmm2
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	20(%ecx), %ecx
	lea	16(%edx), %edx

	/* Rewind to a 64-byte source boundary, keeping the 12-byte
	   phase.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-4(%ecx), %ecx
	sub	%eax, %edx

	movaps	-12(%ecx), %xmm1

L(Shl12LoopStart):
	movaps	4(%ecx), %xmm2
	movaps	20(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$12, %xmm3, %xmm4
	jnz	L(Shl12Start)

	palignr	$12, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl12LoopStart)

/* Terminator is in the block at 4(%ecx): copy the single element before
   it and let the common tail below step both cursors forward by 4.  */
L(Shl12LoopExit):
	movl	(%ecx), %esi
	movl	%esi, (%edx)
	mov	$4, %esi

/* Common tail.  In: %eax = pmovmskb mask of the block holding the
   terminator, %esi = byte offset of that block from %ecx/%edx.
   pcmpeqd sets all four bytes of a matching dword, so mask bits 0-3,
   4-7, 8-11 and 12-15 correspond to elements 0..3 of the block; bit 0
   of %al and of %ah are enough to tell them apart.  The block is copied
   up to and including the terminator.  */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
L(Exit8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit12)
L(Exit16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edi, %eax
	RETURN

/* The ExitTail paths are reached before anything has been pushed; they
   return dest straight from %edx.  */
CFI_POP	(%edi)

	.p2align 4
L(ExitTail4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	ret

END (__wcscpy_ssse3)
#endif
