/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible. The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC CHARs
      one VEC at a time to check for early mismatches. Only do this
      if it is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.

When possible the implementation tries to optimize for the frontend
in the following ways:
Throughput:
    1. All code sections that fit are able to run optimally out of the
       LSD.
    2. All code sections that fit are able to run optimally out of the
       DSB.
    3. Basic blocks are contained in the minimum number of fetch
       blocks necessary.

Latency:
    1. Logically connected basic blocks are put in the same
       cache-line.
    2. Logically connected basic blocks that do not fit in the same
       cache-line are put in adjacent lines. This can benefit from L2
       spatial prefetching and L1 next-line prefetching.  */

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_evex_movbe
# endif

# define VMOVU		vmovdqu64

# ifdef USE_AS_WMEMCMP
#  define VMOVU_MASK	vmovdqu32
#  define CHAR_SIZE	4
#  define VPCMP	vpcmpd
#  define VPTEST	vptestmd
# else
#  define VMOVU_MASK	vmovdqu8
#  define CHAR_SIZE	1
#  define VPCMP	vpcmpub
#  define VPTEST	vptestmb
# endif


# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# define XMM0		xmm16
# define XMM1		xmm17
# define XMM2		xmm18
# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

/* Warning!
           wmemcmp has to use SIGNED comparison for elements.
           memcmp has to use UNSIGNED comparison for elements.
*/
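
/* For example (illustrative values): with byte 0x80 in s1 and 0x01 in
   s2, memcmp must report s1 > s2 because bytes compare unsigned
   (0x80 > 0x01), whereas wmemcmp on the 32-bit elements 0x80000000
   and 0x00000001 must report s1 < s2 because wchar_t elements compare
   signed.  That is why the final compares below use a signed
   cmpl/setg for wmemcmp but zero-extended movzbl/subl for memcmp.  */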

	.section .text.evex,"ax",@progbits
/* Cache align memcmp entry. This allows for much more thorough
   frontend optimization.  */
ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$CHAR_PER_VEC, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it's the hottest.  */
	ja	L(more_1x_vec)

	/* Create mask for CHARs we want to compare. This allows us to
	   avoid having to include page cross logic.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2
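
	/* Illustrative example (memcmp, CHAR_SIZE == 1): for size == 5
	   the bzhi above leaves ecx == 0x1f, so k2 selects only the low
	   5 bytes; for size == CHAR_PER_VEC the index equals the
	   operand width, bzhi leaves ecx == -1, and the full VEC is
	   selected.  Roughly:
	       mask = size < 32 ? (1u << size) - 1 : 0xffffffff;
	   The wmemcmp case is analogous with dword elements.  */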

	/* Safe to load full ymm with mask.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	ret

	.p2align 4
L(return_vec_0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
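	/* The setg/leal pair computes, branchlessly:
	       eax = 2 * (s1[i] > s2[i]) - 1;
	   i.e. 1 when the s1 element is greater and -1 when it is
	   smaller (they cannot be equal since this CHAR mismatched).  */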
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(more_1x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4,(%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	/* NB: eax must be destination register if going to
	   L(return_vec_[0,2]). For L(return_vec_3) destination register
	   must be ecx.  */
	testl	%eax, %eax
	jnz	L(return_vec_0)

	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq	$(CHAR_PER_VEC * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */
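	/* E.g. for size == 5 * CHAR_PER_VEC this checks the 4 VECs
	   ending at CHAR offset `size'.  They overlap the 4 VECs
	   already checked above, but together the two ranges cover the
	   whole buffer, so no length-dependent branches are needed.  */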

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi

	/* Wait to load from s1 until the addresses have been adjusted,
	   as a complex address mode would cause unlamination of
	   micro-fusion.  */

	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
	   will have some 1s.  */
	vpxorq	(%rdi), %YMM1, %YMM1
	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
	   oring with YMM1. Result is stored in YMM4.  */
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
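	/* In boolean terms the two vpternlogd above compute (A is the
	   destination register, B the middle source, C the first
	   source after the immediate):
	       imm 0xde: A = (A ^ C) | B  -> YMM4 = (s2_vec3 ^ s1_vec3) | YMM1
	       imm 0xfe: A = A | B | C    -> YMM4 |= YMM3 | YMM2
	   so YMM4 is now nonzero iff any of the four VEC pairs differ.  */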

	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
	 */
	VPTEST	%YMM4, %YMM4, %k1
	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret


	.p2align 4,, 8
L(8x_end_return_vec_0_1_2_3):
	movq	%rdx, %rdi
L(8x_return_vec_0_1_2_3):
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
	VPTEST	%YMM1, %YMM1, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)

	VPTEST	%YMM2, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	VPTEST	%YMM3, %YMM3, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)
L(return_vec_3):
	/* bsf saves 1 byte from tzcnt. This keeps L(return_vec_3) in
	   one fetch block and the entire L(*return_vec_0_1_2_3) in 1
	   cache line.  */
	bsfl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(return_vec_1):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
	   fetch block.  */
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(return_vec_2):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
	   fetch block.  */
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1. This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because first 4x vec were checked already.  */
	subq	$-(VEC_SIZE * 4), %rdi
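
	/* The loop below is roughly equivalent to (illustrative C only,
	   not compiled; helper names are made up):
	       char *s1_end = s1 + size * CHAR_SIZE;
	       ptrdiff_t diff = s2 - s1;               // kept in %rsi
	       char *loop_end = s1_end - 4 * VEC_SIZE; // kept in %rdx
	       s1 = ALIGN_DOWN (s1, VEC_SIZE) + 4 * VEC_SIZE;
	       do
	         {
	           if (mismatch_in_4_vecs (s1, s1 + diff))
	             goto return_mismatch;
	           s1 += 4 * VEC_SIZE;
	         }
	       while (s1 < loop_end);
	 */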

	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1
	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
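	/* Illustrative example with VEC_SIZE == 32: if 40 bytes remain,
	   rdi == 88 here, so the first branch below (>= VEC_SIZE * 3)
	   is not taken but the second (>= VEC_SIZE * 2) is, and only
	   the last two (overlapping) VECs are checked.  */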
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_end_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4,, 10
L(8x_last_2x_vec):
	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_2)
	/* Naturally aligned to 16 bytes.  */
L(8x_last_1x_vec):
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_3)
	ret

	/* Not ideally aligned (at offset +9 bytes in the fetch block),
	   but not aligning keeps it in the same cache line as
	   L(8x_last_1x/2x_vec), so it is likely worth it.  It also
	   saves code size.  */
	.p2align 4,, 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	addq	%rdx, %rax
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(last_2x_vec):
	/* Check second to last VEC.  */
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0_end)
	ret


	/* Don't align. Takes 2-fetch blocks either way and aligning
	   will cause code to spill into another cacheline.  */
L(return_vec_1_end):
	/* Use bsf to save code size. This is necessary to have
	   L(one_or_less) fit in aligning bytes between.  */
	bsfl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	/* Don't align. Takes 2-fetch blocks either way and aligning
	   will cause code to spill into another cacheline.  */
L(return_vec_0_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret
	/* 1-byte until next cache line.  */

END (MEMCMP)
#endif