/* memcmp with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation,
   so this file also needs to be built for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

#include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_sse2
# endif

# ifdef USE_AS_WMEMCMP
#  define PCMPEQ	pcmpeqd
#  define CHAR_SIZE	4
#  define SIZE_OFFSET	(0)
# else
#  define PCMPEQ	pcmpeqb
#  define CHAR_SIZE	1
# endif

# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET	(0)
#  define CHECK_CMP(x, y)	subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET	(CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y)	cmpl x, y
# endif

# define VEC_SIZE	16
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
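
/* SIZE_OFFSET is (0) for wmemcmp/__memcmpeq and CHAR_PER_VEC * 2 for
   memcmp: in the memcmp case rdx is decremented by CHAR_PER_VEC * 2
   once n is known to exceed CHAR_PER_VEC, and the later length checks
   and end-relative loads fold that adjustment back in via SIZE_OFFSET.
   CHECK_CMP is `subl` for __memcmpeq, where any non-zero difference is
   a valid return value, and `cmpl` otherwise so that eax keeps the raw
   pmovmskb mask needed to locate the first differing position.  */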
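/* Entry (SysV x86-64 ABI): rdi = s1, rsi = s2, rdx = length in CHARs
   (bytes for memcmp/__memcmpeq, wchar_t's for wmemcmp).

   Overall strategy: lengths of at most CHAR_PER_VEC chars use loads of
   at most 8 bytes; (CHAR_PER_VEC, CHAR_PER_VEC * 2] checks one vector
   from each end; up to CHAR_PER_VEC * 8 uses unrolled 2x-vector
   compares; anything larger falls into a 4x-vector loop with rdi
   aligned down to VEC_SIZE.  */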
	.text
ENTRY(MEMCMP)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
	   in ecx for code size. This is preferable to using `incw` as
	   it avoids partial register stalls on older hardware (pre
	   SnB).  */
	movl	$0xffff, %ecx
# endif
	cmpq	$CHAR_PER_VEC, %rdx
	ja	L(more_1x_vec)

# ifdef USE_AS_WMEMCMP
	/* Saves a byte of code by keeping the fall-through path for
	   n = [2, 4] in the initial cache line.  */
	decl	%edx
	jle	L(cmp_0_1)

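	/* PCMPEQ sets each equal position to all-ones; pmovmskb packs
	   that into a 16-bit byte mask (the movq loads only fill the
	   low 8 bytes, but the upper halves of both registers are zero
	   and so compare equal).  Subtracting 0xffff (kept in ecx)
	   gives zero iff everything matched, and otherwise a value
	   whose lowest set bit (bsf) is the first differing byte.  */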
	movq	(%rsi), %xmm0
	movq	(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_0)

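	/* Check the last 8 bytes (2 wchar_t's) the same way; for
	   n = [2, 3] this load overlaps the one above.  */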
	movq	-4(%rsi, %rdx, CHAR_SIZE), %xmm0
	movq	-4(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_end_0_adj)
# else
	cmpl	$8, %edx
	ja	L(cmp_9_16)

	cmpl	$4, %edx
	jb	L(cmp_0_3)

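	/* n = [4, 8]: compare the first and the last 4 bytes; the two
	   loads overlap whenever n < 8.  __memcmpeq just ORs the two
	   differences together, while memcmp packs both halves into one
	   8-byte value per buffer so a single compare decides equality
	   and L(ret_nonzero) orders them.  */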
#  ifdef USE_AS_MEMCMPEQ
	movl	(%rsi), %eax
	subl	(%rdi), %eax

	movl	-4(%rsi, %rdx), %esi
	subl	-4(%rdi, %rdx), %esi

	orl	%esi, %eax
	ret
#  else
	/* Combine the lo and hi 4-byte comparisons into a single
	   8-byte compare.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	-4(%rdi, %rdx), %eax
	shlq	$32, %rcx
	shlq	$32, %rax
	movl	(%rsi), %esi
	movl	(%rdi), %edi
	orq	%rsi, %rcx
	orq	%rdi, %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)
	xorl	%eax, %eax
	ret
#  endif

	.p2align 4,, 10
L(cmp_9_16):
#  ifdef USE_AS_MEMCMPEQ
	movq	(%rsi), %rax
	subq	(%rdi), %rax

	movq	-8(%rsi, %rdx), %rcx
	subq	-8(%rdi, %rdx), %rcx
	orq	%rcx, %rax
	/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
	   return long).  */
	setnz	%cl
	movzbl	%cl, %eax
#  else
	movq	(%rsi), %rcx
	movq	(%rdi), %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)

	movq	-8(%rsi, %rdx, CHAR_SIZE), %rcx
	movq	-8(%rdi, %rdx, CHAR_SIZE), %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)
	xorl	%eax, %eax
#  endif
# endif
	ret

	.p2align 4,, 8
L(cmp_0_1):
	/* Flag set by earlier comparison against 1.  */
	jne	L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
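	/* wmemcmp must order the wchar_t values as signed ints (wchar_t
	   is int here): setg gives 0/1 for s1 > s2 and
	   `leal -1(%rdx, %rdx)` turns that into -1/1.  */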
	movl	(%rdi), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi), %ecx
	je	L(cmp_0_0)
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	subl	%ecx, %eax
# endif
	ret

	/* Fits in aligning bytes.  */
L(cmp_0_0):
	xorl	%eax, %eax
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_start_0):
	bsfl	%eax, %eax
	movl	(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
	ret
# else

#  ifndef USE_AS_MEMCMPEQ
	.p2align 4,, 14
L(ret_nonzero):
	/* Need to bswap to get a proper return without a branch.  */
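	/* After the bswaps an unsigned compare orders the 8-byte chunks
	   lexicographically (i.e. `return a < b ? -1 : 1;`): subq sets
	   CF iff s1 < s2, sbbl turns CF into 0/-1, and `orl $1` maps
	   that to 1/-1.  */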
	bswapq	%rcx
	bswapq	%rax
	subq	%rcx, %rax
	sbbl	%eax, %eax
	orl	$1, %eax
	ret
#  endif

	.p2align 4
L(cmp_0_3):
#  ifdef USE_AS_MEMCMPEQ
	/* No reason to add to the dependency chain on rdx: saving the
	   byte (decl) here doesn't change the number of fetch blocks.  */
	cmpl	$1, %edx
	jbe	L(cmp_0_1)
#  else
	/* We need the smaller code size to avoid taking an extra fetch
	   block.  */
	decl	%edx
	jle	L(cmp_0_1)
#  endif
	movzwl	(%rsi), %ecx
	movzwl	(%rdi), %eax

#  ifdef USE_AS_MEMCMPEQ
	subl	%ecx, %eax

	movzbl	-1(%rsi, %rdx), %esi
	movzbl	-1(%rdi, %rdx), %edi
	subl	%edi, %esi
	orl	%esi, %eax
#  else
	bswapl	%ecx
	bswapl	%eax

	/* Implicit right shift by one. We just need to displace the
	   sign bits.  */
	shrl	%ecx
	shrl	%eax

	/* Eat a partial register stall here (movb into %cl). It saves
	   enough code to stop L(cmp_0_3) from bleeding into the next
	   fetch block and saves an ALU op.  */
	movb	(%rsi, %rdx), %cl
	movzbl	(%rdi, %rdx), %edi
	orl	%edi, %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

	.p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
	   in ecx for code size. This is preferable to using `incw` as
	   it avoids partial register stalls on older hardware (pre
	   SnB).  */
	movl	$0xffff, %ecx
# endif
	movups	(%rsi), %xmm0
	movups	(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
	cmpq	$(CHAR_PER_VEC * 2), %rdx
# else
	/* Offset rdx. Saves just enough code size to keep the
	   L(last_2x_vec) case and the non-zero return in a single
	   cache line.  */
	subq	$(CHAR_PER_VEC * 2), %rdx
# endif
	ja	L(more_2x_vec)

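	/* n = (CHAR_PER_VEC, CHAR_PER_VEC * 2]: check the last vector
	   relative to the end; it overlaps the first vector unless
	   n == CHAR_PER_VEC * 2.  */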
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
# ifndef USE_AS_MEMCMPEQ
	/* Don't use `incw ax` as machines this code runs on are liable
	   to have partial register stalls.  */
	jnz	L(ret_nonzero_vec_end_0)
# else
	/* Various return targets for memcmpeq. Will always be hot in
	   Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
	ret

# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_end_0_adj):
	addl	$3, %edx
#  else
	.p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal	(%rax, %rdx, CHAR_SIZE), %eax
	movl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	addl	%edx, %eax
	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
#  ifndef USE_AS_WMEMCMP
	.p2align 4,, 10
L(ret_nonzero_vec_start_0):
	bsfl	%eax, %eax
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
	ret
#  endif
# endif

	.p2align 5
L(more_2x_vec):
	movups	(VEC_SIZE * 1)(%rsi), %xmm0
	movups	(VEC_SIZE * 1)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_1)

	cmpq	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
	jbe	L(last_2x_vec)

	cmpq	$(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
	ja	L(more_8x_vec)

	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
	   This can harm performance if the non-zero return is in [65,
	   80] or [97, 112] but helps performance otherwise. Generally
	   zero-return is hotter.  */
	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 3)(%rsi), %xmm2
	movups	(VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jnz	L(ret_nonzero_vec_start_2_3)

	cmpl	$(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
	jbe	L(last_2x_vec)

	movups	(VEC_SIZE * 4)(%rsi), %xmm0
	movups	(VEC_SIZE * 4)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 5)(%rsi), %xmm2
	movups	(VEC_SIZE * 5)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
	jz	L(last_2x_vec)
	ret
# else
	jnz	L(ret_nonzero_vec_start_4_5)
# endif
	.p2align 4
L(last_2x_vec):
	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	subl	%ecx, %eax
# ifdef USE_AS_MEMCMPEQ
	/* Various return targets for memcmpeq. Will always be hot in
	   Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
	ret
# else
	jnz	L(ret_nonzero_vec_end_1)
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_end_1):
	pmovmskb %xmm1, %ecx
	/* High 16 bits of eax guaranteed to be all ones. Rotate them in
	   so we can do `or + not` with just `xor`.  */
	rorl	$16, %eax
	xorl	%ecx, %eax
	/* Partial register stall.  */

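	/* Low half of eax is now the not-equal mask of the second-to-
	   last vector; the high half is the combined eq-mask plus one,
	   so bsf lands on the first differing byte across the final 2x
	   VEC.  */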
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal	(%rax, %rdx, CHAR_SIZE), %eax
	movl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	addl	%edx, %eax
	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_vec_start_4_5):
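	/* eax still holds the combined eq-mask of VEC(4) and VEC(5)
	   (CHECK_CMP is `cmpl` here).  Concatenate it above the VEC(4)
	   eq-mask and add 1 so the carry ripples through the all-ones
	   prefix and bsf lands on the first differing byte.  */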
	pmovmskb %xmm1, %edx
	sall	$16, %eax
	leal	1(%rax, %rdx), %eax
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 4)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 4)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_start_1):
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 1)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 1)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

	.p2align 4
L(more_8x_vec):
	subq	%rdi, %rsi
	leaq	(VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
	andq	$(VEC_SIZE * -1), %rdi
	addq	%rdi, %rsi
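	/* Keep only the s2 - s1 delta in rsi while rdi is rounded down
	   to a VEC_SIZE boundary, so the PCMPEQ memory operands on rdi
	   in the loop are aligned and rsi tracks the matching (possibly
	   unaligned) position; rdx becomes the bound that terminates
	   the loop.  Each iteration tests 4x VEC at once by ANDing the
	   four equality masks.  */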
	.p2align 4
L(loop_4x):
	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 3)(%rsi), %xmm1

	PCMPEQ	(VEC_SIZE * 2)(%rdi), %xmm0
	PCMPEQ	(VEC_SIZE * 3)(%rdi), %xmm1

	movups	(VEC_SIZE * 4)(%rsi), %xmm2
	movups	(VEC_SIZE * 5)(%rsi), %xmm3

	PCMPEQ	(VEC_SIZE * 4)(%rdi), %xmm2
	PCMPEQ	(VEC_SIZE * 5)(%rdi), %xmm3

	pand	%xmm0, %xmm1
	pand	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_loop)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rsi
	cmpq	%rdi, %rdx
	ja	L(loop_4x)
	/* Get remaining length in edx.  */
	subl	%edi, %edx
	/* Restore offset so we can reuse L(last_2x_vec).  */
	addl	$(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
	shrl	$2, %edx
# endif
	cmpl	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
	jbe	L(last_2x_vec)

	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 3)(%rsi), %xmm2
	movups	(VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jz	L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
	ret
# else

	.p2align 4
L(ret_nonzero_vec_start_2_3):
	pmovmskb %xmm1, %edx
	sall	$16, %eax
	leal	1(%rax, %rdx), %eax

	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_loop):
	pmovmskb %xmm0, %ecx
	pmovmskb %xmm1, %edx
	sall	$(VEC_SIZE * 1), %edx
	leal	1(%rcx, %rdx), %edx
	pmovmskb %xmm2, %ecx
	/* High 16 bits of eax guaranteed to be all ones. Rotate them in
	   so we can do `or + not` with just `xor`.  */
	rorl	$16, %eax
	xorl	%ecx, %eax

	salq	$32, %rax
	orq	%rdx, %rax

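	/* From bit 0 up, rax now holds: the VEC(2)/VEC(3) locator
	   (their eq-masks concatenated, plus one), the not-equal mask
	   of VEC(4), and the combined eq-mask plus one, so a single
	   bsfq yields the offset of the first differing byte within
	   the 4x VEC block.  */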
	bsfq	%rax, %rax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif
END(MEMCMP)
#endif