/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible. The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE one vector
      at a time to check for early mismatches. Only do this if it is
      guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
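
/* For reference only (not part of the build): a minimal C sketch of the
   contract this file implements, assuming the usual internal prototype
   int __memcmpeq (const void *, const void *, size_t).  Unlike memcmp,
   only zero vs. non-zero is meaningful here, which is why the vector
   code below can return a raw byte-mismatch bitmask directly.

       int
       __memcmpeq (const void *s1, const void *s2, size_t n)
       {
         const unsigned char *p1 = s1, *p2 = s2;
         int ret = 0;
         while (n--)
           ret |= *p1++ ^ *p2++;
         return ret;
       }
*/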

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_evex
# endif

# define VMOVU_MASK	vmovdqu8
# define VMOVU	vmovdqu64
# define VPCMP	vpcmpub
# define VPTEST	vptestmb

# define VEC_SIZE	32
# define PAGE_SIZE	4096

# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22


	.section .text.evex, "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it's the hottest case.  */
	ja	L(more_1x_vec)

	/* Create a mask of bytes that are guaranteed to be valid because
	   of length (edx). Using masked movs allows us to skip checks for
	   page crosses/zero size.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2
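
	/* Worked example (illustrative): for edx = 5, bzhi keeps only the
	   low 5 bits of -1, i.e. ecx = (1u << 5) - 1 = 0x1f, so k2
	   enables byte lanes 0-4 of the masked load and compare below.
	   For edx = 0 the mask is empty and eax necessarily ends up 0
	   (equal).  */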

	/* Use masked loads as VEC_SIZE could page cross where length
	   (edx) would not.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
	kmovd	%k1, %eax
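	/* eax is now a bitmask of the mismatching bytes inside the length
	   mask; __memcmpeq only has to return zero / non-zero, so the
	   mask can be returned as-is with no branch or bit-scan.  */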
	ret


L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
	kmovd	%k1, %eax
L(return_neq0):
	ret



	.p2align 4
L(more_1x_vec):
	/* From VEC + 1 to 2 * VEC.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4,(%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq0)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
	addq	%rdx, %rdi

	/* Wait to load from s1 until the address has been adjusted, so the
	   loads below can use simple base + displacement addressing and
	   avoid unlamination.  */

	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
	   will have some 1s.  */
	vpxorq	-(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
	   oring with YMM1. Result is stored in YMM2.  */
	vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
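	/* Note: imm8 0xde evaluates to YMM2 = YMM1 | (YMM2 ^ mem), so YMM2
	   now accumulates the mismatch bits of both vector pairs handled
	   so far in a single instruction.  */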

	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	vpxorq	-(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	-(VEC_SIZE)(%rsi, %rdx), %YMM4
	vpxorq	-(VEC_SIZE)(%rdi), %YMM4, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
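	/* imm8 0xfe is the three-way OR: a bit of YMM4 is set iff it was
	   set in YMM2, YMM3 or YMM4, so YMM4 is zero only if all four
	   vector pairs matched.  */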

	/* Compare YMM4 with 0. If any 1s, s1 and s2 don't match.  */
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	ret

	.p2align 4
L(more_8x_vec):
	/* Set rdx to the end of s1 minus 4 * VEC_SIZE (the loop bound).  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1. This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
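	/* Illustrative note: since rsi holds the constant difference
	   s2 - s1, (%rsi, %rdi) is always the s2 address matching the s1
	   address in rdi (rdi + (s2 - s1)).  Rounding rdi down may
	   re-compare up to VEC_SIZE - 1 already-checked bytes on the first
	   iteration, which is harmless and cheaper than branching.  */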
	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdi), %YMM4, %YMM4

	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_neq2)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
	/* rdi has 4 * VEC_SIZE - remaining length.  */
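	/* E.g. if at most VEC_SIZE bytes remain, edi >= VEC_SIZE * 3 and
	   only the final vector, already xored into YMM4 above, needs to
	   be tested.  */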
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
	   oring with YMM4. Result is stored in YMM4.  */
	vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
L(8x_last_1x_vec):
L(8x_last_2x_vec):
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %eax
L(return_neq2):
	ret

	.p2align 4,, 8
L(last_2x_vec):
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
	VPTEST	%YMM2, %YMM2, %k1
	kmovd	%k1, %eax
	ret

	/* 1 byte from next cache line.  */
END (MEMCMPEQ)
#endif