/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>
# include "evex256-vecs.h"
# if VEC_SIZE != 32
#  error "VEC_SIZE != 32 unimplemented"
# endif

# ifndef MEMRCHR
#  define MEMRCHR				__memrchr_evex
# endif

# define PAGE_SIZE			4096
# define VECMATCH			VEC(0)

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
# else
	test	%RDX_LP, %RDX_LP
# endif
	jz	L(zero_0)

	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
	   correct page cross check and 2) it correctly sets up end ptr to be
	   subtract by lzcnt aligned.  */
	leaq	-1(%rdi, %rdx), %rax
	vpbroadcastb %esi, %VECMATCH

	/* Check if we can load 1x VEC without cross a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %eax
	jz	L(page_cross)

	/* Don't use rax for pointer here because EVEX has better encoding with
	   offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx

	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
	cmpq	$VEC_SIZE, %rdx
	ja	L(more_1x_vec)
L(ret_vec_x0_test):

	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
	   will guarantee edx (len) is less than it.  */
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Fits in aligning bytes of first cache line.  */
L(zero_0):
	xorl	%eax, %eax
	ret

	.p2align 4,, 9
L(ret_vec_x0_dec):
	decq	%rax
L(ret_vec_x0):
	lzcntl	%ecx, %ecx
	subq	%rcx, %rax
	ret

	.p2align 4,, 10
L(more_1x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Align rax (pointer to string).  */
	andq	$-VEC_SIZE, %rax

	/* Recompute length after aligning.  */
	movq	%rax, %rdx

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	%rdi, %rdx

	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):

	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
	decq	%rax
	cmpl	$VEC_SIZE, %edx
	jbe	L(ret_vec_x0_test)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Don't use rax for pointer here because EVEX has better encoding with
	   offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx
	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
	lzcntq	%rcx, %rcx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Inexpensive place to put this regarding code size / target alignments
	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
	   in first cache line.  */
L(page_cross):
	movq	%rax, %rsi
	andq	$-VEC_SIZE, %rsi
	vpcmpb	$0, (%rsi), %VECMATCH, %k0
	kmovd	%k0, %r8d
	/* Shift out negative alignment (because we are starting from endptr and
	   working backwards).  */
	movl	%eax, %ecx
	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
	notl	%ecx
	shlxl	%ecx, %r8d, %ecx
	cmpq	%rdi, %rsi
	ja	L(more_1x_vec)
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_1)
	subq	%rcx, %rax
	ret

	/* Continue creating zero labels that fit in aligning bytes and get
	   2-byte encoding / are in the same cache line as condition.  */
L(zero_1):
	xorl	%eax, %eax
	ret

	.p2align 4,, 8
L(ret_vec_x1):
	/* This will naturally add 32 to position.  */
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_2x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	cmpl	$(VEC_SIZE * -1), %edx
	jle	L(ret_vec_x2_test)
L(last_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)


	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 3 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2_test):
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(ret_vec_x3):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_4x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	/* Check if near end before re-aligning (otherwise might do an
	   unnecessary loop iteration).  */
	addq	$-(VEC_SIZE * 4), %rax
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)

	decq	%rax
	andq	$-(VEC_SIZE * 4), %rax
	movq	%rdi, %rdx
	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
	   lengths that overflow can be valid and break the comparison.  */
	andq	$-(VEC_SIZE * 4), %rdx

	.p2align 4
L(loop_4x_vec):
	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
	   on).  */
	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1

	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4

	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
	vptestnmb %VEC(3), %VEC(3), %k2

	/* Any 1s and we found CHAR.  */
	kortestd %k2, %k4
	jnz	L(loop_end)

	addq	$-(VEC_SIZE * 4), %rax
	cmpq	%rdx, %rax
	jne	L(loop_4x_vec)

	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
	subq	$-(VEC_SIZE * 4), %rdx
	movq	%rdx, %rax
	subl	%edi, %edx
L(last_4x_vec):

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)


	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 3), %edx
	ja	L(last_vec)

	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	jbe	L(ret_1)
	xorl	%eax, %eax
L(ret_1):
	ret

	.p2align 4,, 6
L(loop_end):
	kmovd	%k1, %ecx
	notl	%ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_end)

	vptestnmb %VEC(2), %VEC(2), %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1_end)

	kmovd	%k2, %ecx
	kmovd	%k4, %esi
	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
	   then it won't affect the result in esi (VEC4). If ecx is non-zero
	   then CHAR in VEC3 and bsrq will use that position.  */
	salq	$32, %rcx
	orq	%rsi, %rcx
	bsrq	%rcx, %rcx
	addq	%rcx, %rax
	ret
	.p2align 4,, 4
L(ret_vec_x0_end):
	addq	$(VEC_SIZE), %rax
L(ret_vec_x1_end):
	bsrl	%ecx, %ecx
	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
	ret

END(MEMRCHR)
#endif