/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# define STRCMP_ISA	_evex
# include "strcmp-naming.h"

# include <sysdep.h>
# if defined USE_AS_STRCASECMP_L
#  include "locale-defines.h"
# endif

# ifndef STRCMP
#  define STRCMP	__strcmp_evex
# endif

# define PAGE_SIZE	4096

	/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32
# define CHAR_PER_VEC	(VEC_SIZE / SIZE_OF_CHAR)

# define VMOVU	vmovdqu64
# define VMOVA	vmovdqa64

# ifdef USE_AS_WCSCMP
#  define TESTEQ	subl $0xff,
	/* Compare packed dwords.  */
#  define VPCMP	vpcmpd
#  define VPMINU	vpminud
#  define VPTESTM	vptestmd
#  define VPTESTNM	vptestnmd
	/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
#  define TESTEQ	incl
	/* Compare packed bytes.  */
#  define VPCMP	vpcmpb
#  define VPMINU	vpminub
#  define VPTESTM	vptestmb
#  define VPTESTNM	vptestnmb
	/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

# ifdef USE_AS_STRNCMP
#  define LOOP_REG	r9d
#  define LOOP_REG64	r9

#  define OFFSET_REG8	r9b
#  define OFFSET_REG	r9d
#  define OFFSET_REG64	r9
# else
#  define LOOP_REG	edx
#  define LOOP_REG64	rdx

#  define OFFSET_REG8	dl
#  define OFFSET_REG	edx
#  define OFFSET_REG64	rdx
# endif

# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
#  define VEC_OFFSET	0
# else
#  define VEC_OFFSET	(-VEC_SIZE)
# endif

# define XMM0	xmm17
# define XMM1	xmm18

# define XMM10	xmm27
# define XMM11	xmm28
# define XMM12	xmm29
# define XMM13	xmm30
# define XMM14	xmm31


# define YMM0	ymm17
# define YMM1	ymm18
# define YMM2	ymm19
# define YMM3	ymm20
# define YMM4	ymm21
# define YMM5	ymm22
# define YMM6	ymm23
# define YMM7	ymm24
# define YMM8	ymm25
# define YMM9	ymm26
# define YMM10	ymm27
# define YMM11	ymm28
# define YMM12	ymm29
# define YMM13	ymm30
# define YMM14	ymm31

# ifdef USE_AS_STRCASECMP_L
#  define BYTE_LOOP_REG	OFFSET_REG
# else
#  define BYTE_LOOP_REG	ecx
# endif

# ifdef USE_AS_STRCASECMP_L
#  ifdef USE_AS_STRNCMP
#   define LOCALE_REG	rcx
#   define LOCALE_REG_LP	RCX_LP
#  else
#   define LOCALE_REG	rdx
#   define LOCALE_REG_LP	RDX_LP
#  endif
# endif

# define LCASE_MIN_YMM	%YMM12
# define LCASE_MAX_YMM	%YMM13
# define CASE_ADD_YMM	%YMM14

# define LCASE_MIN_XMM	%XMM12
# define LCASE_MAX_XMM	%XMM13
# define CASE_ADD_XMM	%XMM14

	/* NB: wcsncmp uses r11 but strcasecmp is never used in
	   conjunction with wcscmp.  */
# define TOLOWER_BASE	%r11

# ifdef USE_AS_STRCASECMP_L
#  define _REG(x, y) x ## y
#  define REG(x, y) _REG(x, y)
#  define TOLOWER(reg1, reg2, ext)					\
	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);		\
	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);		\
	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;		\
	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;		\
	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};			\
	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}

#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)

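	/* The vector TOLOWER above folds ASCII uppercase to lowercase
	   without a table lookup: subtract 0x41 ('A', the L(lcase_min)
	   constant), unsigned-compare the result against 0x1a (26, the
	   L(lcase_max) constant) to build a mask of the bytes that were
	   in 'A'..'Z', and add 0x20 (L(case_add)) only under that mask.
	   A minimal scalar C sketch of the same idea (illustrative only,
	   not part of this file):

	   unsigned char tolower_ascii (unsigned char c)
	   {
	     // c - 'A' wraps for c < 'A', so the unsigned < 26 test
	     // selects exactly 'A'..'Z'.
	     if ((unsigned char) (c - 0x41) < 0x1a)
	       c += 0x20;
	     return c;
	   }  */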
#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)			\
	TOLOWER	(s1_reg, s2_reg, ext);					\
	VPCMP	$0, s1_reg, s2_reg, reg_out

#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)		\
	VMOVU	s2_mem, s2_reg;						\
	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)

#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)

#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)

# else
#  define TOLOWER_gpr(...)
#  define TOLOWER_YMM(...)
#  define TOLOWER_XMM(...)

#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)			\
	VPCMP	$0, s2_reg, s1_reg, reg_out

#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)

#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)		\
	VPCMP	$0, s2_mem, s1_reg, reg_out

#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
# endif

/* Warning!
           wcscmp/wcsncmp have to use SIGNED comparison for elements.
           strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
   latter can be on either packed bytes or dwords depending on
   USE_AS_WCSCMP. In order to check for the null CHAR, the algorithm
   keeps track of the matched bytes/dwords, requiring 5 EVEX
   instructions (3 VPCMP and 2 KORD). In general, the cost of comparing
   VEC_SIZE bytes (32 bytes) is 3 VPCMP and 2 KORD instructions,
   together with VMOVU and ktestd instructions.  The main loop (away
   from a page boundary) compares 4 vectors at a time, effectively
   comparing 4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
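
/* As a rough illustration of the strategy described above, here is a
   minimal C sketch of the byte flavour (no page-cross, strncmp, or
   case-folding handling; the function name and the fixed 32-byte block
   structure are illustrative assumptions, not part of this file):

   int strcmp_sketch (const unsigned char *s1, const unsigned char *s2)
   {
     for (;; s1 += 32, s2 += 32)
       for (int i = 0; i < 32; i++)
         if (s1[i] != s2[i] || s1[i] == '\0')
           return s1[i] - s2[i];
   }

   The assembly builds the "mismatch or null" test for a whole vector
   at once: VPTESTM produces a mask of non-null CHARs in the s1 vector
   and the masked VPCMP then sets a bit only where the CHARs are equal
   and non-null, so an all-ones mask means "keep going".  */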

	.section .text.evex, "ax", @progbits
	.align	16
	.type	STRCMP, @function
	.globl	STRCMP
# ifdef USE_AS_STRCASECMP_L
ENTRY (STRCASECMP)
	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
	mov	%fs:(%rax), %LOCALE_REG_LP

	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
	.p2align 4
END (STRCASECMP)
	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
# endif

	.p2align 4
STRCMP:
	cfi_startproc
	_CET_ENDBR
	CALL_MCOUNT

# if defined USE_AS_STRCASECMP_L
	/* We have to fall back on the C implementation for locales with
	   encodings not matching ASCII for single bytes.  */
#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
#  else
	mov	(%LOCALE_REG), %RAX_LP
#  endif
	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
	jne	STRCASECMP_L_NONASCII
	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
# endif

# ifdef USE_AS_STRNCMP
	/* Don't overwrite LOCALE_REG (rcx) until we have passed
	   L(one_or_less). Otherwise we might use the wrong locale in
	   the OVERFLOW_STRCMP (strcasecmp_l).  */
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
	cmp	$1, %RDX_LP
	/* Signed comparison intentional. We use this branch to also
	   test cases where length >= 2^63. These very large sizes can be
	   handled with strcmp as there is no way for that length to
	   actually bound the buffer.  */
	jle	L(one_or_less)
# endif

# if defined USE_AS_STRCASECMP_L
	.section .rodata.cst32, "aM", @progbits, 32
	.align	32
L(lcase_min):
	.quad	0x4141414141414141
	.quad	0x4141414141414141
	.quad	0x4141414141414141
	.quad	0x4141414141414141
L(lcase_max):
	.quad	0x1a1a1a1a1a1a1a1a
	.quad	0x1a1a1a1a1a1a1a1a
	.quad	0x1a1a1a1a1a1a1a1a
	.quad	0x1a1a1a1a1a1a1a1a
L(case_add):
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.previous

	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
# endif

	movl	%edi, %eax
	orl	%esi, %eax
	/* Shift out the bits irrelevant to the page boundary ([63:12]).  */
	sall	$20, %eax
	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
	ja	L(page_cross)
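	/* Worked example of the check above (a conservative filter, not
	   an exact test): with PAGE_SIZE = 4096 and VEC_SIZE * 4 = 128,
	   the shift keeps only the low 12 bits of (s1 | s2) as the high
	   bits of eax, so the unsigned compare asks whether
	   ((s1 | s2) & 0xfff) > 4096 - 128.  E.g. a pointer with page
	   offset 0xfe0 (= 4064 > 3968) takes the branch, since reading
	   128 bytes from it would cross into the next page.  Because the
	   page offsets are OR'ed together, false positives are possible;
	   they are filtered out again in L(page_cross).  */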

L(no_page_cross):
	/* Safe to compare 4x vectors.  */
	VMOVU	(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (%rsi).  */
	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(vec_0_test_len)
# endif

	/* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
	   wcscmp/wcsncmp.  */

	/* An all-1s mask means all characters were equal. TESTEQ will
	   overflow it to zero in that case. Otherwise the carry only
	   propagates up to the position of the first mismatch.  */
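	/* Concretely: for strcmp the 32-bit mask is 0xffffffff when all
	   32 bytes match, so `incl` wraps it to 0 and sets ZF; for
	   wcscmp only 8 mask bits are meaningful, so `subl $0xff` yields
	   0 for the all-equal 0xff mask.  In the non-equal case the
	   result is non-zero and its trailing zero count (used by the
	   tzcnt below) is the index of the first mismatch or null.  */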
	TESTEQ	%ecx
	jz	L(more_3x_vec)

	.p2align 4,, 4
L(return_vec_0):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret0)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
# endif
L(ret0):
	ret

# ifdef USE_AS_STRNCMP
	.p2align 4,, 4
L(vec_0_test_len):
	notl	%ecx
	bzhil	%edx, %ecx, %eax
	jnz	L(return_vec_0)
	/* Align if it will cross a fetch block.  */
	.p2align 4,, 2
L(ret_zero):
	xorl	%eax, %eax
	ret

	.p2align 4,, 5
L(one_or_less):
#  ifdef USE_AS_STRCASECMP_L
	/* Set locale argument for strcasecmp.  */
	movq	%LOCALE_REG, %rdx
#  endif
	jb	L(ret_zero)
	/* 'nbe' covers the case where length is negative (large
	   unsigned).  */
	jnbe	OVERFLOW_STRCMP
#  ifdef USE_AS_WCSCMP
	movl	(%rdi), %edx
	xorl	%eax, %eax
	cmpl	(%rsi), %edx
	je	L(ret1)
	setl	%al
	negl	%eax
	orl	$1, %eax
#  else
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
#  endif
L(ret1):
	ret
# endif

	.p2align 4,, 10
L(return_vec_1):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_STRNCMP
	/* rdx must be > CHAR_PER_VEC so it's safe to subtract without
	   worrying about underflow.  */
	addq	$-CHAR_PER_VEC, %rdx
	cmpq	%rcx, %rdx
	jbe	L(ret_zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret2)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
# endif
L(ret2):
	ret

	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_3):
#  if CHAR_PER_VEC <= 16
	sall	$CHAR_PER_VEC, %ecx
#  else
	salq	$CHAR_PER_VEC, %rcx
#  endif
# endif
L(return_vec_2):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
	tzcntl	%ecx, %ecx
# else
	tzcntq	%rcx, %rcx
# endif

# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %rdx
	jbe	L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret3)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
# endif
L(ret3):
	ret

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_3):
	tzcntl	%ecx, %ecx
#  ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret4)
	setl	%al
	negl	%eax
	orl	$1, %eax
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
#  endif
L(ret4):
	ret
# endif

	/* 32 byte align here ensures the main loop is ideally aligned
	   for DSB.  */
	.p2align 5
L(more_3x_vec):
	/* Safe to compare 4x vectors.  */
	VMOVU	(VEC_SIZE)(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_1)

# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(ret_zero)
# endif

	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_3)

# ifdef USE_AS_STRNCMP
	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(ret_zero)
# endif


# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	 */
	movl	$2, %r8d

# else
	xorl	%r8d, %r8d
# endif

	/* The prepare labels are various entry points from the page
	   cross logic.  */
L(prepare_loop):

# ifdef USE_AS_STRNCMP
#  ifdef USE_AS_WCSCMP
L(prepare_loop_no_len):
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	shrl	$2, %ecx
	leaq	(CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
#  else
	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
	   the loop.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
L(prepare_loop_no_len):
#  endif
# else
L(prepare_loop_no_len):
# endif

	/* Align s1 and adjust s2 accordingly.  */
	subq	%rdi, %rsi
	andq	$-(VEC_SIZE * 4), %rdi
L(prepare_loop_readj):
	addq	%rdi, %rsi
# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
	subq	%rdi, %rdx
# endif

L(prepare_loop_aligned):
	/* eax stores distance from rsi to next page cross. These cases
	   need to be handled specially as the 4x loop could potentially
	   read memory past the length of s1 or s2 and across a page
	   boundary.  */
	movl	$-(VEC_SIZE * 4), %eax
	subl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax


	/* Loop 4x comparisons at a time.  */
	.p2align 4
L(loop):

	/* End condition for strncmp.  */
# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(ret_zero)
# endif

	subq	$-(VEC_SIZE * 4), %rdi
	subq	$-(VEC_SIZE * 4), %rsi

	/* Check if rsi loads will cross a page boundary.  */
	addl	$-(VEC_SIZE * 4), %eax
	jnb	L(page_cross_during_loop)

	/* Loop entry after handling page cross during loop.  */
L(loop_skip_page_cross_check):
	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6

	VPMINU	%YMM0, %YMM2, %YMM8
	VPMINU	%YMM4, %YMM6, %YMM9

	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
	VPMINU	%YMM8, %YMM9, %YMM9

	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
	VPTESTM	%YMM9, %YMM9, %k1
# ifndef USE_AS_STRCASECMP_L
	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
	   oring with YMM1. Result is stored in YMM6.  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
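	/* Reminder on the vpternlogd immediates used here: the 8-bit
	   immediate is the truth table of a three-input boolean function
	   of (dst, src2, src3).  0xde encodes dst = (dst ^ src3) | src2,
	   which is why the single instruction above both xors the memory
	   operand into YMM6 and ors in YMM1.  The $0xfe used a few lines
	   below encodes a plain three-way OR.  */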
# else
	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
	TOLOWER_YMM (%YMM0, %YMM1)
	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
	TOLOWER_YMM (%YMM2, %YMM3)
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
	TOLOWER_YMM (%YMM4, %YMM5)
	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
	TOLOWER_YMM (%YMM6, %YMM7)
	vpxorq	%YMM0, %YMM1, %YMM1
	vpxorq	%YMM2, %YMM3, %YMM3
	vpxorq	%YMM4, %YMM5, %YMM5
	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
# endif
	/* Or together YMM3, YMM5, and YMM6.  */
	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6


	/* A non-zero CHAR in YMM6 represents a mismatch.  */
	VPTESTNM %YMM6, %YMM6, %k0{%k1}
	kmovd	%k0, %LOOP_REG

	TESTEQ	%LOOP_REG
	jz	L(loop)


	/* Find which VEC has the mismatch or end of string.  */
	VPTESTM	%YMM0, %YMM0, %k1
	VPTESTNM %YMM1, %YMM1, %k0{%k1}
	kmovd	%k0, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_0_end)

	VPTESTM	%YMM2, %YMM2, %k1
	VPTESTNM %YMM3, %YMM3, %k0{%k1}
	kmovd	%k0, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_1_end)


	/* Handle VEC 2 and 3 without branches.  */
L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(ret_zero_end)
# endif

	VPTESTM	%YMM4, %YMM4, %k1
	VPTESTNM %YMM5, %YMM5, %k0{%k1}
	kmovd	%k0, %ecx
	TESTEQ	%ecx
# if CHAR_PER_VEC <= 16
	sall	$CHAR_PER_VEC, %LOOP_REG
	orl	%ecx, %LOOP_REG
# else
	salq	$CHAR_PER_VEC, %LOOP_REG64
	orq	%rcx, %LOOP_REG64
# endif
L(return_vec_3_end):
	/* LOOP_REG contains matches for null/mismatch from the loop. If
	   VEC 0, 1, and 2 all have no null and no mismatches then the
	   mismatch must entirely be from VEC 3 which is fully
	   represented by LOOP_REG.  */
# if CHAR_PER_VEC <= 16
	tzcntl	%LOOP_REG, %LOOP_REG
# else
	tzcntq	%LOOP_REG64, %LOOP_REG64
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%LOOP_REG64, %rdx
	jbe	L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
	je	L(ret5)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret5):
	ret

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_end):
	xorl	%eax, %eax
	ret
# endif


	/* The L(return_vec_N_end) labels differ from L(return_vec_N) in
	   that they use the value of `r8` to negate the return value.
	   This is because the page cross logic can swap `rdi` and
	   `rsi`.  */
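	/* How the conditional negation works for the byte variants: with
	   r8d == 0 (no swap), `xorl %r8d, %eax; subl %r8d, %eax` leaves
	   eax unchanged; with r8d == -1 (swap), it computes
	   (eax ^ -1) - (-1) == ~eax + 1 == -eax.  The wide-char variants
	   only need the sign of eax flipped, which the single xor with
	   the r8d values chosen elsewhere (2 / -4) already achieves.  */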
	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_1_end):
#  if CHAR_PER_VEC <= 16
	sall	$CHAR_PER_VEC, %ecx
#  else
	salq	$CHAR_PER_VEC, %rcx
#  endif
# endif
L(return_vec_0_end):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
	tzcntl	%ecx, %ecx
# else
	tzcntq	%rcx, %rcx
# endif

# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %rdx
	jbe	L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret6)
	setl	%al
	negl	%eax
	/* This is the non-zero case for `eax` so just xorl with `r8d`
	   to flip the sign if `rdi` and `rsi` were swapped.  */
	xorl	%r8d, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
	   logic. Subtract `r8d` after xor for zero case.  */
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret6):
	ret

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_1_end):
	tzcntl	%ecx, %ecx
#  ifdef USE_AS_WCSCMP
	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret7)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
#  else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
#  endif
L(ret7):
	ret
# endif


	/* Page cross in rsi in next 4x VEC.  */

	/* TODO: Improve logic here.  */
	.p2align 4,, 10
L(page_cross_during_loop):
	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */

	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any logic here.  */
	cmpl	$-(VEC_SIZE * 4), %eax
	/* Don't adjust eax before jumping back to the loop; we will
	   never hit the page cross case again.  */
	je	L(loop_skip_page_cross_check)

	/* Check if we can safely load a VEC.  */
	cmpl	$-(VEC_SIZE * 3), %eax
	jle	L(less_1x_vec_till_page_cross)

	VMOVA	(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_0_end)

	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
	cmpl	$-(VEC_SIZE * 2), %eax
	jg	L(more_2x_vec_till_page_cross)

	.p2align 4,, 4
L(less_1x_vec_till_page_cross):
	subl	$-(VEC_SIZE * 4), %eax
	/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
	   concerning case is first iteration if incoming s1 was near start
	   of a page and s2 near end. If s1 was near the start of the page
	   we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe
	   to read back -VEC_SIZE. If rdi is truly at the start of a page
	   here, it means the previous page (rdi - VEC_SIZE) has already
	   been loaded earlier so must be valid.  */
	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
	/* Mask of potentially valid bits. The lower bits can be out of
	   range comparisons (but safe regarding page crosses).  */

# ifdef USE_AS_WCSCMP
	movl	$-1, %r10d
	movl	%esi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	shrl	$2, %ecx
	shlxl	%ecx, %r10d, %ecx
	movzbl	%cl, %r10d
# else
	movl	$-1, %ecx
	shlxl	%esi, %ecx, %r10d
# endif

	kmovd	%k1, %ecx
	notl	%ecx


# ifdef USE_AS_STRNCMP
#  ifdef USE_AS_WCSCMP
	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
	   safe.  */
	movl	%eax, %r11d
	shrl	$2, %r11d
	cmpq	%r11, %rdx
#  else
	cmpq	%rax, %rdx
#  endif
	jbe	L(return_page_cross_end_check)
# endif
	movl	%eax, %OFFSET_REG

	/* Readjust eax before potentially returning to the loop.  */
	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax

	andl	%r10d, %ecx
	jz	L(loop_skip_page_cross_check)

	.p2align 4,, 3
L(return_page_cross_end):
	tzcntl	%ecx, %ecx

# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
L(return_page_cross_cmp_mem):
# else
	addl	%OFFSET_REG, %ecx
# endif
# ifdef USE_AS_WCSCMP
	movl	VEC_OFFSET(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
	je	L(ret8)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret8):
	ret

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_page_cross_end_check):
	andl	%r10d, %ecx
	tzcntl	%ecx, %ecx
	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
#  ifdef USE_AS_WCSCMP
	sall	$2, %edx
#  endif
	cmpl	%ecx, %edx
	ja	L(return_page_cross_cmp_mem)
	xorl	%eax, %eax
	ret
# endif


	.p2align 4,, 10
L(more_2x_vec_till_page_cross):
	/* If more than 2x VEC till page cross we will complete a full
	   loop iteration here.  */

	VMOVA	VEC_SIZE(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_1_end)

# ifdef USE_AS_STRNCMP
	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(ret_zero_in_loop_page_cross)
# endif

	subl	$-(VEC_SIZE * 4), %eax

	/* Safe to include comparisons from lower bytes.  */
	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_page_cross_0)

	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_page_cross_1)

# ifdef USE_AS_STRNCMP
	/* Must check length here as length might preclude reading the
	   next page.  */
#  ifdef USE_AS_WCSCMP
	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
	   safe.  */
	movl	%eax, %r11d
	shrl	$2, %r11d
	cmpq	%r11, %rdx
#  else
	cmpq	%rax, %rdx
#  endif
	jbe	L(ret_zero_in_loop_page_cross)
# endif

	/* Finish the loop.  */
	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
	VPMINU	%YMM4, %YMM6, %YMM9
	VPTESTM	%YMM9, %YMM9, %k1
# ifndef USE_AS_STRCASECMP_L
	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
# else
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
	TOLOWER_YMM (%YMM4, %YMM5)
	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
	TOLOWER_YMM (%YMM6, %YMM7)
	vpxorq	%YMM4, %YMM5, %YMM5
	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
# endif
	VPTESTNM %YMM6, %YMM6, %k0{%k1}
	kmovd	%k0, %LOOP_REG
	TESTEQ	%LOOP_REG
	jnz	L(return_vec_2_3_end)

	/* Best for code size to include an unconditional jmp here. If
	   this case were hot it would be faster to duplicate the
	   L(return_vec_2_3_end) code as fall-through and jump back to
	   the loop on mismatch comparison.  */
	subq	$-(VEC_SIZE * 4), %rdi
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_skip_page_cross_check)
L(ret_zero_in_loop_page_cross):
	xorl	%eax, %eax
	ret
# else
	jmp	L(loop_skip_page_cross_check)
# endif


	.p2align 4,, 10
L(return_vec_page_cross_0):
	addl	$-VEC_SIZE, %eax
L(return_vec_page_cross_1):
	tzcntl	%ecx, %ecx
# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
#  ifdef USE_AS_STRNCMP
#   ifdef USE_AS_WCSCMP
	/* Must divide ecx instead of multiply rdx due to overflow.  */
	movl	%ecx, %eax
	shrl	$2, %eax
	cmpq	%rax, %rdx
#   else
	cmpq	%rcx, %rdx
#   endif
	jbe	L(ret_zero_in_loop_page_cross)
#  endif
# else
	addl	%eax, %ecx
# endif

# ifdef USE_AS_WCSCMP
	movl	VEC_OFFSET(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
	je	L(ret9)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret9):
	ret


	.p2align 4,, 10
L(page_cross):
# ifndef USE_AS_STRNCMP
	/* If both are VEC aligned we don't need any special logic here.
	   Only valid for strcmp where the stop condition is guaranteed
	   to be reachable by just reading memory.  */
	testl	$((VEC_SIZE - 1) << 20), %eax
	jz	L(no_page_cross)
# endif

	movl	%edi, %eax
	movl	%esi, %ecx
	andl	$(PAGE_SIZE - 1), %eax
	andl	$(PAGE_SIZE - 1), %ecx

	xorl	%OFFSET_REG, %OFFSET_REG

	/* Check which is closer to page cross, s1 or s2.  */
	cmpl	%eax, %ecx
	jg	L(page_cross_s2)

	/* The previous page cross check has false positives. Check for
	   true positive as page cross logic is very expensive.  */
	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
	jbe	L(no_page_cross)


	/* Set r8 to not interfere with normal return value (rdi and rsi
	   did not swap).  */
# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	 */
	movl	$2, %r8d
# else
	xorl	%r8d, %r8d
# endif

	/* Check if less than 1x VEC till page cross.  */
	subl	$(VEC_SIZE * 3), %eax
	jg	L(less_1x_vec_till_page)


	/* If more than 1x VEC till page cross, loop through safely
	   loadable memory until within 1x VEC of page cross.  */
	.p2align 4,, 8
L(page_cross_loop):
	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(check_ret_vec_page_cross)
	addl	$CHAR_PER_VEC, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross)
# endif
	addl	$VEC_SIZE, %eax
	jl	L(page_cross_loop)

# ifdef USE_AS_WCSCMP
	shrl	$2, %eax
# endif


	subl	%eax, %OFFSET_REG
	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
	   to not cross page so is safe to load. Since we have already
	   loaded at least 1 VEC from rsi it is also guaranteed to be safe.
	 */
	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}

	kmovd	%k1, %ecx
# ifdef USE_AS_STRNCMP
	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
	cmpq	%rax, %rdx
	jbe	L(check_ret_vec_page_cross2)
#  ifdef USE_AS_WCSCMP
	addq	$-(CHAR_PER_VEC * 2), %rdx
#  else
	addq	%rdi, %rdx
#  endif
# endif
	TESTEQ	%ecx
	jz	L(prepare_loop_no_len)

	.p2align 4,, 4
L(ret_vec_page_cross):
# ifndef USE_AS_STRNCMP
L(check_ret_vec_page_cross):
# endif
	tzcntl	%ecx, %ecx
	addl	%OFFSET_REG, %ecx
L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret12)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret12):
	ret


# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(check_ret_vec_page_cross2):
	TESTEQ	%ecx
L(check_ret_vec_page_cross):
	tzcntl	%ecx, %ecx
	addl	%OFFSET_REG, %ecx
	cmpq	%rcx, %rdx
	ja	L(ret_vec_page_cross_cont)
	.p2align 4,, 2
L(ret_zero_page_cross):
	xorl	%eax, %eax
	ret
# endif

	.p2align 4,, 4
L(page_cross_s2):
	/* Ensure this is a true page cross.  */
	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
	jbe	L(no_page_cross)


	movl	%ecx, %eax
	movq	%rdi, %rcx
	movq	%rsi, %rdi
	movq	%rcx, %rsi

	/* set r8 to negate return value as rdi and rsi swapped.  */
# ifdef USE_AS_WCSCMP
	movl	$-4, %r8d
# else
	movl	$-1, %r8d
# endif
	xorl	%OFFSET_REG, %OFFSET_REG

	/* Check if more than 1x VEC till page cross.  */
	subl	$(VEC_SIZE * 3), %eax
	jle	L(page_cross_loop)

	.p2align 4,, 6
L(less_1x_vec_till_page):
# ifdef USE_AS_WCSCMP
	shrl	$2, %eax
# endif
	/* Find largest load size we can use.  */
	cmpl	$(16 / SIZE_OF_CHAR), %eax
	ja	L(less_16_till_page)

	/* Use 16 byte comparison.  */
	vmovdqu	(%rdi), %xmm0
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xf, %ecx
# else
	incw	%cx
# endif
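	/* Only 16 bytes (4 wide chars) were compared, so only the low 16
	   (resp. 4) mask bits are meaningful: `incw %cx` wraps 0xffff to
	   0 in the all-equal case, and `subl $0xf, %ecx` does the same
	   for the 4-bit wide-char mask, mirroring what TESTEQ does for a
	   full VEC.  */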
	jnz	L(check_ret_vec_page_cross)
	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subl	%eax, %OFFSET_REG
# else
	/* Explicit check for 16 byte alignment.  */
	subl	%eax, %OFFSET_REG
	jz	L(prepare_loop)
# endif
	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xf, %ecx
# else
	incw	%cx
# endif
	jnz	L(check_ret_vec_page_cross)
# ifdef USE_AS_STRNCMP
	addl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subq	$-(CHAR_PER_VEC * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
	jmp	L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case0):
	xorl	%eax, %eax
	ret
# endif


	.p2align 4,, 10
L(less_16_till_page):
	cmpl	$(24 / SIZE_OF_CHAR), %eax
	ja	L(less_8_till_page)

	/* Use 8 byte comparison.  */
	vmovq	(%rdi), %xmm0
	vmovq	(%rsi), %xmm1
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0x3, %ecx
# else
	incb	%cl
# endif
	jnz	L(check_ret_vec_page_cross)


# ifdef USE_AS_STRNCMP
	cmpq	$(8 / SIZE_OF_CHAR), %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
# endif
	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
	subl	%eax, %OFFSET_REG

	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0x3, %ecx
# else
	incb	%cl
# endif
	jnz	L(check_ret_vec_page_cross)


# ifdef USE_AS_STRNCMP
	addl	$(8 / SIZE_OF_CHAR), %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subq	$-(CHAR_PER_VEC * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
	jmp	L(prepare_loop_aligned)




	.p2align 4,, 10
L(less_8_till_page):
# ifdef USE_AS_WCSCMP
	/* If using wchar then this is the only check before we reach
	   the page boundary.  */
	movl	(%rdi), %eax
	movl	(%rsi), %ecx
	cmpl	%ecx, %eax
	jnz	L(ret_less_8_wcs)
#  ifdef USE_AS_STRNCMP
	addq	$-(CHAR_PER_VEC * 2), %rdx
	/* We already checked for len <= 1 so cannot hit that case here.
	 */
#  endif
	testl	%eax, %eax
	jnz	L(prepare_loop)
	ret

	.p2align 4,, 8
L(ret_less_8_wcs):
	setl	%OFFSET_REG8
	negl	%OFFSET_REG
	movl	%OFFSET_REG, %eax
	xorl	%r8d, %eax
	ret

# else
	cmpl	$28, %eax
	ja	L(less_4_till_page)

	vmovd	(%rdi), %xmm0
	vmovd	(%rsi), %xmm1
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
	subl	$0xf, %ecx
	jnz	L(check_ret_vec_page_cross)

#  ifdef USE_AS_STRNCMP
	cmpq	$4, %rdx
	jbe	L(ret_zero_page_cross_slow_case1)
#  endif
	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
	subl	%eax, %OFFSET_REG

	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
	subl	$0xf, %ecx
	jnz	L(check_ret_vec_page_cross)
#  ifdef USE_AS_STRNCMP
	addl	$(4 / SIZE_OF_CHAR), %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case1)
	subq	$-(CHAR_PER_VEC * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
#  else
	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
#  endif
	jmp	L(prepare_loop_aligned)


#  ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case1):
	xorl	%eax, %eax
	ret
#  endif

	.p2align 4,, 10
L(less_4_till_page):
	subq	%rdi, %rsi
	/* Extremely slow byte comparison loop.  */
L(less_4_loop):
	movzbl	(%rdi), %eax
	movzbl	(%rsi, %rdi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
	subl	%BYTE_LOOP_REG, %eax
	jnz	L(ret_less_4_loop)
	testl	%ecx, %ecx
	jz	L(ret_zero_4_loop)
#  ifdef USE_AS_STRNCMP
	decq	%rdx
	jz	L(ret_zero_4_loop)
#  endif
	incq	%rdi
	/* End condition is reaching the page boundary (rdi is aligned).  */
	testl	$31, %edi
	jnz	L(less_4_loop)
	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
	addq	$-(VEC_SIZE * 4), %rdi
#  ifdef USE_AS_STRNCMP
	subq	$-(CHAR_PER_VEC * 4), %rdx
#  endif
	jmp	L(prepare_loop_aligned)

L(ret_zero_4_loop):
	xorl	%eax, %eax
	ret
L(ret_less_4_loop):
	xorl	%r8d, %eax
	subl	%r8d, %eax
	ret
# endif
	cfi_endproc
	.size	STRCMP, .-STRCMP
#endif
