/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible. The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE, one vector
      at a time, to check for early mismatches. Only do this if it is
      guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
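
/* For reference, the contract implemented here is only an equality
   test: the return value is zero if s1 and s2 are equal and some
   unspecified non-zero value otherwise (no ordering is implied).
   A rough, illustrative C sketch of that contract (names here are
   informal, assuming <stddef.h> for size_t):

       int
       __memcmpeq_ref (const void *s1, const void *s2, size_t n)
       {
	 const unsigned char *p1 = s1, *p2 = s2;
	 int neq = 0;
	 for (size_t i = 0; i < n; i++)
	   neq |= p1[i] ^ p2[i];
	 return neq;
       }

   The code below computes the same result with vector loads and
   returns the non-zero mask/accumulator directly on mismatch.  */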

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_evex
# endif

# define VMOVU_MASK	vmovdqu8
# define VMOVU	vmovdqu64
# define VPCMP	vpcmpub
# define VPTEST	vptestmb

# define VEC_SIZE	32
# define PAGE_SIZE	4096

# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22


	.section .text.evex, "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it's the hottest.  */
	ja	L(more_1x_vec)

	/* Create mask of bytes that are guaranteed to be valid because
	   of length (edx). Using masked movs allows us to skip checks
	   for page crosses/zero size.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2

	/* Use masked loads, as a full VEC_SIZE load could cross a page
	   where a load of length (edx) would not.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
	kmovd	%k1, %eax
	ret
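
	/* A worked example of the path above (illustrative only): for
	   size = 5, bzhil produces ecx = 0x1f, so k2 enables only the
	   low 5 byte lanes.  The masked VMOVU_MASK load then never
	   touches bytes past s2 + 4 (so it cannot fault on a page
	   cross), and VPCMP with predicate 4 (not-equal) under the
	   same mask sets a bit in k1 only for differing bytes among
	   the first 5.  The kmovd result is therefore zero exactly
	   when the buffers are equal, which is all __memcmpeq must
	   report.  */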


L(last_1x_vec):
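	/* Reached for sizes in (VEC_SIZE, 2 * VEC_SIZE]: the first
	   VEC_SIZE bytes were already compared at L(more_1x_vec), so
	   comparing the last VEC_SIZE bytes (which may overlap the
	   first load) covers the rest.  */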
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
	kmovd	%k1, %eax
L(return_neq0):
	ret



	.p2align 4
L(more_1x_vec):
	/* From VEC + 1 to 2 * VEC.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4,(%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */
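
	/* A C-level sketch of this branchless tail (illustrative only;
	   V() stands for a VEC_SIZE-byte load):

	       d  = V(s1 + len - 4*VEC) ^ V(s2 + len - 4*VEC);
	       d |= V(s1 + len - 3*VEC) ^ V(s2 + len - 3*VEC);
	       d |= V(s1 + len - 2*VEC) ^ V(s2 + len - 2*VEC);
	       d |= V(s1 + len - 1*VEC) ^ V(s2 + len - 1*VEC);
	       return (d != 0);

	   The four loads are taken relative to the end of the buffers,
	   so for sizes below 8 * VEC they overlap the vectors already
	   checked above instead of branching on the exact size.  */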

	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
	addq	%rdx, %rdi

	/* Wait to load from s1 until the address has been adjusted, so
	   the loads below can use base-only addressing and avoid
	   unlamination.  */

	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
	   will have some 1s.  */
	vpxorq	-(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
	   oring with YMM1. Result is stored in YMM2.  */
	vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
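	/* How the 0xde immediate is derived (informal note): for
	   vpternlogd the immediate is a truth table indexed by the bit
	   triplet (dest, src2, src3).  With A = YMM2 (dest), B = YMM1
	   and C = the memory operand, the desired function (A ^ C) | B
	   gives, for A,B,C = 000 through 111, the results
	   0,1,1,1,1,0,1,1; read back as bits 7..0 that is 1101 1110,
	   i.e. 0xde.  */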

	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	vpxorq	-(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	-(VEC_SIZE)(%rsi, %rdx), %YMM4
	vpxorq	-(VEC_SIZE)(%rdi), %YMM4, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
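	/* 0xfe is the three-way OR truth table: every row except
	   A = B = C = 0 yields 1, so this computes
	   YMM4 = YMM4 | YMM3 | YMM2 (informal note).  */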

	/* Compare YMM4 with 0. If there are any 1s, s1 and s2 don't
	   match.  */
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	ret

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1. This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x vec were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
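	/* Informal sketch of the loop below in C-like form (V() is a
	   VEC_SIZE-byte load; names are illustrative):

	       off = s2 - s1;                   // kept in rsi
	       end = s1 + len - 4*VEC;          // kept in rdx
	       for (p = align_down (s1, VEC) + 4*VEC; p < end; p += 4*VEC)
		 if (((V(p) ^ V(p + off))
		      | (V(p + VEC) ^ V(p + VEC + off))
		      | (V(p + 2*VEC) ^ V(p + 2*VEC + off))
		      | (V(p + 3*VEC) ^ V(p + 3*VEC + off))) != 0)
		   return nonzero;

	   Note the use of `subq $-(VEC_SIZE * 4)' rather than an add:
	   128 does not fit in a sign-extended imm8, but -128 does, so
	   the subtraction gets the shorter encoding.  */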
	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdi), %YMM4, %YMM4

	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq2)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
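	/* Informal worked example of this dispatch, with
	   VEC_SIZE = 32: if 37 bytes remain after the loop,
	   rdi = 128 - 37 = 91 < 96, so the branch above falls through
	   and the second-to-last vector is folded in as well; if 20
	   bytes remain, rdi = 108 >= 96 and the single (already
	   loaded) last vector suffices.  */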
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
	   oring with YMM4. Result is stored in YMM4.  */
	vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
L(8x_last_1x_vec):
L(8x_last_2x_vec):
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
L(return_neq2):
	ret

	.p2align 4,, 8
L(last_2x_vec):
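	/* Reached for sizes in (2 * VEC_SIZE, 4 * VEC_SIZE]: the first
	   two vectors were already compared above, so xor-accumulating
	   the last two (possibly overlapping) vectors covers the
	   rest.  */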
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
	VPTEST	%YMM2, %YMM2, %k1
	kmovd	%k1, %eax
	ret

	/* 1 byte from next cache line.  */
END (MEMCMPEQ)
#endif