/* Optimized memcmp implementation for POWER10.
Copyright (C) 2021-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>.  */
#include <sysdep.h>
/* TODO: Replace these macros with the actual instructions once the minimum
   binutils version is >= 2.35.  They are used to keep compatibility with
   older assemblers.  */
#define VEXTRACTBM(rt,vrb) \
.long(((4)<<(32-6)) \
| ((rt)<<(32-11)) \
| ((8)<<(32-16)) \
| ((vrb)<<(32-21)) \
| 1602)
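
/* The encoding above is vextractbm rt,vrb (ISA 3.1): it gathers the most
   significant bit of each byte of vrb into a 16-bit mask in rt.  */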
#define LXVP(xtp,dq,ra) \
.long(((6)<<(32-6)) \
| ((((xtp)-32)>>1)<<(32-10)) \
| ((1)<<(32-11)) \
| ((ra)<<(32-16)) \
| dq)
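
/* lxvp xtp,dq(ra) (ISA 3.1) loads 32 bytes into an even/odd pair of VSX
   registers.  The macro above hand-encodes it so the file still builds with
   assemblers that predate the POWER10 instructions.  */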
/* Compare 32 bytes loaded from r3+offset and r4+offset with lxvp, branching
   to tail_2 or tail_1 as soon as vcmpneb. finds a mismatch in the
   corresponding 16-byte half.  */
#define COMPARE_32(vr1,vr2,offset,tail_1,tail_2)\
LXVP(32+vr1,offset,r3); \
LXVP(32+vr2,offset,r4); \
vcmpneb. v5,vr1+1,vr2+1; \
bne cr6,L(tail_2); \
vcmpneb. v4,vr1,vr2; \
bne cr6,L(tail_1); \
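
/* Compute the memcmp return value once a mismatch has been found: vctzlsbb
   gives the byte index of the first mismatch in the vcmpneb. result,
   vextubrx pulls that byte out of each source vector, and their difference
   is returned in r3.  */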
#define TAIL(v_res,s1,s2) \
vctzlsbb r7,v_res; \
vextubrx r8,r7,s1; \
vextubrx r9,r7,s2; \
subf r3,r9,r8; \
blr; \
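
/* Strategy: the main loop compares 128 bytes per iteration using four
   32-byte lxvp/vcmpneb. blocks.  Remainders larger than 64 bytes get one
   more pair of 32-byte compares, and the final <= 64 bytes are compared
   with length-limited lxvl loads of at most 16 bytes each.  */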
/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4],
size_t size [r5]) */
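/* MEMCMP may already be defined by a multiarch wrapper so that this body is
   assembled under an alternative symbol name.  */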
#ifndef MEMCMP
# define MEMCMP memcmp
#endif
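/* Only the hand-encoded macros above need POWER10; every instruction emitted
   directly by the assembler is available on POWER9, so .machine power9 is
   sufficient.  */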
.machine power9
ENTRY_TOCLESS (MEMCMP, 4)
CALL_MCOUNT 3
cmpldi cr6,r5,64
bgt cr6,L(loop_head)
/* Compare up to 64 bytes.  This section handles lengths <= 64 and the final
   bytes of larger lengths.  */
L(last_compare):
li r8,16
sldi r9,r5,56
sldi r8,r8,56
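/* lxvl takes the number of bytes to load from bits 0:7 (the most significant
   byte) of its length register, hence the shifts left by 56 above.  Each
   load below transfers at most 16 bytes.  */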
addi r6,r3,16
addi r7,r4,16
/* Load up to 16 bytes from each string.  */
lxvl 32+v0,r3,r9
lxvl 32+v2,r4,r9
/* The sub. and vcmpneb. results are combined by the crnand so that a single
   branch tests both.  It computes NOT(CR0.GT AND CR6.EQ) into CR0.LT, so the
   branch is taken when r9 is not greater than 0 (these 16 bytes were the
   last ones) or v4 is not all zero (a mismatch was found).  */
sub. r9,r9,r8
vcmpneb. v4,v0,v2
crnand 4*cr0+lt,4*cr0+gt,4*cr6+eq
bt 4*cr0+lt,L(tail1)
addi r3,r3,32
addi r4,r4,32
lxvl 32+v1,r6,r9
lxvl 32+v3,r7,r9
sub. r9,r9,r8
vcmpneb. v5,v1,v3
crnand 4*cr0+lt,4*cr0+gt,4*cr6+eq
bt 4*cr0+lt,L(tail2)
addi r6,r3,16
addi r7,r4,16
lxvl 32+v6,r3,r9
lxvl 32+v8,r4,r9
sub. r9,r9,r8
vcmpneb. v4,v6,v8
crnand 4*cr0+lt,4*cr0+gt,4*cr6+eq
bt 4*cr0+lt,L(tail3)
lxvl 32+v7,r6,r9
lxvl 32+v9,r7,r9
vcmpneb. v5,v7,v9
bne cr6,L(tail4)
L(finish):
/* The contents are equal. */
li r3,0
blr
L(loop_head):
/* Calculate the number of 128-byte blocks to compare in the main loop.  */
srdi. r8,r5,7
beq L(loop_tail)
mtctr r8
/* Main loop.  Compares 128 bytes per iteration.  */
.p2align 5
L(loop_128):
COMPARE_32(v0,v2,0,tail1,tail2)
COMPARE_32(v6,v8,32,tail3,tail4)
COMPARE_32(v10,v12,64,tail5,tail6)
COMPARE_32(v14,v16,96,tail7,tail8)
addi r3,r3,128
addi r4,r4,128
bdnz L(loop_128)
/* Account for the bytes compared by the loop: keep only size % 128 and
   finish if nothing remains.  */
clrldi. r5,r5,57
beq L(finish)
/* If more than 64 bytes remain, compare 64 bytes here first, then finish
   with the final <= 64-byte compare.  */
.p2align 5
L(loop_tail):
cmpldi r5,64
ble L(last_compare)
COMPARE_32(v0,v2,0,tail1,tail2)
COMPARE_32(v6,v8,32,tail3,tail4)
addi r3,r3,64
addi r4,r4,64
subi r5,r5,64
b L(last_compare)
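
/* Tail entry points.  Each one receives the vcmpneb. result plus the two
   source vectors that were being compared when the mismatch was detected.  */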
L(tail1):
TAIL(v4,v0,v2)
L(tail2):
TAIL(v5,v1,v3)
L(tail3):
TAIL(v4,v6,v8)
L(tail4):
TAIL(v5,v7,v9)
L(tail5):
TAIL(v4,v10,v12)
L(tail6):
TAIL(v5,v11,v13)
L(tail7):
TAIL(v4,v14,v16)
L(tail8):
TAIL(v5,v15,v17)
END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
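
/* __memcmpeq is only required to return zero/non-zero, so plain memcmp is a
   valid implementation of it.  */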
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)