1/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
2   For AMD x86-64.
3   Copyright (C) 2009-2021 Free Software Foundation, Inc.
4   This file is part of the GNU C Library.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, see
18   <https://www.gnu.org/licenses/>.  */
19
20#include <sysdep.h>
21
22	.text
/* Locate the first byte equal to the low byte of %esi in the
   NUL-terminated string at %rdi.

   In:   %rdi = string pointer, %esi = character (only %sil is used).
   Out:  %rax = address of the first matching byte.  If the terminating
	 NUL is reached first, %rax = 0, or the address of that NUL when
	 AS_STRCHRNUL is defined.  A search for 0 finds the NUL itself.
   Uses: %rax, %rcx, %rdx, %rdi, %r8, %r9, %xmm0-%xmm6 and the flags;
	 the stack is not touched.

   Throughout, a "hit mask" has bit N set when byte N of the examined
   range equals the character or is 0; bsf on it yields the offset of
   the first byte at which the search must stop.  */
23ENTRY (strchr)
	/* Broadcast the character to all 16 bytes of %xmm1.  The offset
	   computation on %eax is interleaved with the shuffles.  */
24	movd	%esi, %xmm1
25	movl	%edi, %eax
26	andl	$4095, %eax
27	punpcklbw %xmm1, %xmm1
	/* An offset above 4032 (4096 - 64) means a 64-byte read starting at
	   %rdi would run past the 4096-byte boundary that the $4095 mask
	   measures against (a page boundary, going by the L(cross_page)
	   label).  The two SSE shuffles that follow do not write the flags,
	   so the jg still sees the result of this compare.  */
28	cmpl	$4032, %eax
29	punpcklwd %xmm1, %xmm1
30	pshufd	$0, %xmm1, %xmm1
31	jg	L(cross_page)
	/* First 16 bytes, read unaligned:
	   %xmm0 = (byte == ch) | (byte == 0); %xmm3 = zero vector.  */
32	movdqu	(%rdi), %xmm0
33	pxor	%xmm3, %xmm3
34	movdqa	%xmm0, %xmm4
35	pcmpeqb	%xmm1, %xmm0
36	pcmpeqb	%xmm3, %xmm4
37	por	%xmm4, %xmm0
38	pmovmskb %xmm0, %eax
	/* No hit in bytes 0..15: go on to bytes 16..63.  */
39	test	%eax, %eax
40	je	L(next_48_bytes)
	/* %eax = offset of the first hit.  */
41	bsf	%eax, %eax
42#ifdef AS_STRCHRNUL
43	leaq	(%rdi,%rax), %rax
44#else
	/* Return 0 when the byte at the hit is not the character, i.e. the
	   hit was the terminating NUL of a search for a non-zero byte.  */
45	movl	$0, %edx
46	leaq	(%rdi,%rax), %rax
47	cmpb	%sil, (%rax)
48	cmovne	%rdx, %rax
49#endif
50	ret
51
52	.p2align 3
	/* Build in %rax a 64-bit hit mask for bytes 16..63 of the string.
	   Bits 0..15 stay clear (that chunk had no hit), so bit N still
	   corresponds to byte %rdi + N when L(return) is reached.  */
53	L(next_48_bytes):
54	movdqu	16(%rdi), %xmm0
55	movdqa	%xmm0, %xmm4
56	pcmpeqb	%xmm1, %xmm0
57	pcmpeqb	%xmm3, %xmm4
58	por	%xmm4, %xmm0
59	pmovmskb %xmm0, %ecx
60	movdqu	32(%rdi), %xmm0
61	movdqa	%xmm0, %xmm4
62	pcmpeqb	%xmm1, %xmm0
63	salq	$16, %rcx
64	pcmpeqb	%xmm3, %xmm4
65	por	%xmm4, %xmm0
66	pmovmskb %xmm0, %eax
67	movdqu	48(%rdi), %xmm0
	/* The zero vector in %xmm3 is overwritten with the NUL-compare
	   result here; the loop below keeps its own zero in %xmm6.  */
68	pcmpeqb	%xmm0, %xmm3
69	salq	$32, %rax
70	pcmpeqb	%xmm1, %xmm0
71	orq	%rcx, %rax
72	por	%xmm3, %xmm0
73	pmovmskb %xmm0, %ecx
74	salq	$48, %rcx
75	orq	%rcx, %rax
76	testq	%rax, %rax
77	jne	L(return)
78L(loop_start):
79	/* We use this alignment to force the loop to be aligned to 8 but not
80	   16 bytes.  This gives better scheduling on AMD processors.  */
81	.p2align 4
	/* %xmm6 = zero vector for the loop.  Round %rdi down to a 64-byte
	   boundary; the loop pre-increments, so scanning resumes at the next
	   aligned block.  On both paths into L(loop_start) every string byte
	   below that block has already been examined.  */
82	pxor	%xmm6, %xmm6
83	andq	$-64, %rdi
84	.p2align 3
	/* Main loop, 64 aligned bytes per iteration.  For each 16-byte chunk
	   form min (data ^ ch, data) with the unsigned byte minimum: a result
	   byte is 0 exactly when the data byte equals ch (the xor is 0) or is
	   itself 0.  Folding the four chunk results into %xmm5 with pminub
	   lets a single compare against zero test all 64 bytes.  */
85L(loop64):
86	addq	$64, %rdi
87	movdqa	(%rdi), %xmm5
88	movdqa	16(%rdi), %xmm2
89	movdqa	32(%rdi), %xmm3
90	pxor	%xmm1, %xmm5
91	movdqa	48(%rdi), %xmm4
92	pxor	%xmm1, %xmm2
93	pxor	%xmm1, %xmm3
94	pminub	(%rdi), %xmm5
95	pxor	%xmm1, %xmm4
96	pminub	16(%rdi), %xmm2
97	pminub	32(%rdi), %xmm3
98	pminub	%xmm2, %xmm5
99	pminub	48(%rdi), %xmm4
100	pminub	%xmm3, %xmm5
101	pminub	%xmm4, %xmm5
102	pcmpeqb %xmm6, %xmm5
103	pmovmskb %xmm5, %eax
104
105	testl	%eax, %eax
106	je	L(loop64)
107
	/* Some byte of this block hit.  %xmm2-%xmm4 still hold the per-chunk
	   minima for bytes 16..63, so comparing them with zero gives their
	   hit masks.  %xmm5 was overwritten by the fold, so the chunk at
	   offset 0 is recomputed from memory.  */
108	movdqa	(%rdi), %xmm5
109	movdqa	%xmm5, %xmm0
110	pcmpeqb	%xmm1, %xmm5
111	pcmpeqb	%xmm6, %xmm0
112	por	%xmm0, %xmm5
113	pcmpeqb %xmm6, %xmm2
114	pcmpeqb %xmm6, %xmm3
115	pcmpeqb %xmm6, %xmm4
116
	/* Assemble the 64-bit hit mask in %rax: chunk at offset 0 in bits
	   0..15, offset 16 in bits 16..31, offset 32 in bits 32..47 and
	   offset 48 in bits 48..63.  */
117	pmovmskb %xmm5, %ecx
118	pmovmskb %xmm2, %eax
119	salq	$16, %rax
120	pmovmskb %xmm3, %r8d
121	pmovmskb %xmm4, %edx
122	salq	$32, %r8
123	orq	%r8, %rax
124	orq	%rcx, %rax
125	salq	$48, %rdx
126	orq	%rdx, %rax
127	.p2align 3
	/* Common exit, reached by fall-through from the loop tail and by
	   jumps from L(next_48_bytes) and L(cross_page).
	   In: %rax = non-zero hit mask whose bit N corresponds to byte
	   %rdi + N.  */
128L(return):
129	bsfq	%rax, %rax
130#ifdef AS_STRCHRNUL
131	leaq	(%rdi,%rax), %rax
132#else
	/* As on the early-return path: yield 0 if the hit byte is not the
	   character (the terminating NUL was reached first).  */
133	movl	$0, %edx
134	leaq	(%rdi,%rax), %rax
135	cmpb	%sil, (%rax)
136	cmovne	%rdx, %rax
137#endif
138	ret
139	.p2align 4
140
	/* The string starts too close to the 4096-byte boundary for a
	   64-byte read from %rdi.  Instead scan the 64-byte aligned block
	   that contains %rdi (%rdx = %rdi rounded down) with aligned loads;
	   an aligned 64-byte block never straddles a 4096-byte boundary.
	   %xmm2 = zero vector, %xmm0 = copy of the broadcast character.  */
141L(cross_page):
142	movq	%rdi, %rdx
143	pxor	%xmm2, %xmm2
144	andq	$-64, %rdx
145	movdqa	%xmm1, %xmm0
146	movdqa	(%rdx), %xmm3
147	movdqa	%xmm3, %xmm4
148	pcmpeqb	%xmm1, %xmm3
149	pcmpeqb	%xmm2, %xmm4
150	por	%xmm4, %xmm3
151	pmovmskb %xmm3, %r8d
152	movdqa	16(%rdx), %xmm3
153	movdqa	%xmm3, %xmm4
154	pcmpeqb	%xmm1, %xmm3
155	pcmpeqb	%xmm2, %xmm4
156	por	%xmm4, %xmm3
157	pmovmskb %xmm3, %eax
158	movdqa	32(%rdx), %xmm3
159	movdqa	%xmm3, %xmm4
160	pcmpeqb	%xmm1, %xmm3
161	salq	$16, %rax
162	pcmpeqb	%xmm2, %xmm4
163	por	%xmm4, %xmm3
164	pmovmskb %xmm3, %r9d
165	movdqa	48(%rdx), %xmm3
	/* Last chunk: the zero vector (%xmm2) and the character copy
	   (%xmm0) are overwritten with their compare results.  */
166	pcmpeqb	%xmm3, %xmm2
167	salq	$32, %r9
168	pcmpeqb	%xmm3, %xmm0
169	orq	%r9, %rax
170	orq	%r8, %rax
171	por	%xmm2, %xmm0
172	pmovmskb %xmm0, %ecx
173	salq	$48, %rcx
174	orq	%rcx, %rax
	/* %cl = low byte of %rdi minus low byte of %rdx, i.e. the offset
	   (0..63) of the string start inside the aligned block.  Shifting
	   the mask right by it drops the bits for bytes that precede the
	   string, so bit N corresponds to byte %rdi + N again.  */
175	movl	%edi, %ecx
176	subb	%dl, %cl
177	shrq	%cl, %rax
178	testq	%rax, %rax
179	jne	L(return)
	/* Nothing in the rest of this block: continue with the aligned
	   loop, which starts at the following 64-byte block.  */
180	jmp	L(loop_start)
181
182END (strchr)
183
/* These two lines are skipped when the file is assembled with
   AS_STRCHRNUL defined (the variant that returns the address of the
   terminating NUL instead of 0).  weak_alias and libc_hidden_builtin_def
   are macros defined outside this file: presumably the first publishes
   `index' as a weak alias of strchr and the second emits the hidden
   library-internal definition -- confirm against their definitions.  */
184#ifndef AS_STRCHRNUL
185weak_alias (strchr, index)
186libc_hidden_builtin_def (strchr)
187#endif
188