1/* Find character CH in a NUL terminated string.
2   Highly optimized version for ix85, x>=5.
3   Copyright (C) 1995-2021 Free Software Foundation, Inc.
4   This file is part of the GNU C Library.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, see
18   <https://www.gnu.org/licenses/>.  */
19
20#include <sysdep.h>
21#include "asm-syntax.h"
22
23/* This version is especially optimized for the i586 (and following?)
24   processors.  This is mainly done by using the two pipelines.  The
25   version optimized for i486 is weak in this aspect because to get
26   as much parallelism we have to execute some *more* instructions.
27
28   The code below is structured to reflect the pairing of the instructions
29   as *I think* it is.  I have no processor data book to verify this.
30   If you find something you think is incorrect let me know.  */
31
32
33/* The magic value which is used throughout in the whole code.  */
34#define magic 0xfefefeff
35
36#define PARMS	4+16	/* space for 4 saved regs */
37#define RTN	PARMS
38#define STR	RTN
39#define CHR	STR+4
40
41	.text
42ENTRY (strchr)
43
44	pushl %edi		/* Save callee-safe registers.  */
45	cfi_adjust_cfa_offset (-4)
46	pushl %esi
47	cfi_adjust_cfa_offset (-4)
48
49	pushl %ebx
50	cfi_adjust_cfa_offset (-4)
51	pushl %ebp
52	cfi_adjust_cfa_offset (-4)
53
54	movl STR(%esp), %eax
55	movl CHR(%esp), %edx
56
57	movl %eax, %edi		/* duplicate string pointer for later */
58	cfi_rel_offset (edi, 12)
59	xorl %ecx, %ecx		/* clear %ecx */
60
61	/* At the moment %edx contains C.  What we need for the
62	   algorithm is C in all bytes of the dword.  Avoid
63	   operations on 16 bit words because these require an
64	   prefix byte (and one more cycle).  */
65	movb %dl, %dh		/* now it is 0|0|c|c */
66	movb %dl, %cl		/* we construct the lower half in %ecx */
67
68	shll $16, %edx		/* now %edx is c|c|0|0 */
69	movb %cl, %ch		/* now %ecx is 0|0|c|c */
70
71	orl %ecx, %edx		/* and finally c|c|c|c */
72	andl $3, %edi		/* mask alignment bits */
73
74	jz L(11)		/* alignment is 0 => start loop */
75
76	movb %dl, %cl		/* 0 is needed below */
77	jp L(0)			/* exactly two bits set */
78
79	xorb (%eax), %cl	/* is byte the one we are looking for? */
80	jz L(out)		/* yes => return pointer */
81
82	xorb %dl, %cl		/* load single byte and test for NUL */
83	je L(3)			/* yes => return NULL */
84
85	movb 1(%eax), %cl	/* load single byte */
86	incl %eax
87
88	cmpb %cl, %dl		/* is byte == C? */
89	je L(out)		/* aligned => return pointer */
90
91	cmpb $0, %cl		/* is byte NUL? */
92	je L(3)			/* yes => return NULL */
93
94	incl %eax
95	decl %edi
96
97	jne L(11)
98
99L(0):	movb (%eax), %cl	/* load single byte */
100
101	cmpb %cl, %dl		/* is byte == C? */
102	je L(out)		/* aligned => return pointer */
103
104	cmpb $0, %cl		/* is byte NUL? */
105	je L(3)			/* yes => return NULL */
106
107	incl %eax		/* increment pointer */
108
109	cfi_rel_offset (esi, 8)
110	cfi_rel_offset (ebx, 4)
111	cfi_rel_offset (ebp, 0)
112
113	/* The following code is the preparation for the loop.  The
114	   four instruction up to `L1' will not be executed in the loop
115	   because the same code is found at the end of the loop, but
116	   there it is executed in parallel with other instructions.  */
117L(11):	movl (%eax), %ecx
118	movl $magic, %ebp
119
120	movl $magic, %edi
121	addl %ecx, %ebp
122
123	/* The main loop: it looks complex and indeed it is.  I would
124	   love to say `it was hard to write, so it should he hard to
125	   read' but I will give some more hints.  To fully understand
126	   this code you should first take a look at the i486 version.
127	   The basic algorithm is the same, but here the code organized
128	   in a way which permits to use both pipelines all the time.
129
130	   I tried to make it a bit more understandable by indenting
131	   the code according to stage in the algorithm.  It goes as
132	   follows:
133		check for 0 in 1st word
134			check for C in 1st word
135					check for 0 in 2nd word
136						check for C in 2nd word
137		check for 0 in 3rd word
138			check for C in 3rd word
139					check for 0 in 4th word
140						check for C in 4th word
141
142	   Please note that doing the test for NUL before the test for
143	   C allows us to overlap the test for 0 in the next word with
144	   the test for C.  */
145
146L(1):	xorl %ecx, %ebp			/* (word^magic) */
147	addl %ecx, %edi			/* add magic word */
148
149	leal 4(%eax), %eax		/* increment pointer */
150	jnc L(4)			/* previous addl caused overflow? */
151
152		movl %ecx, %ebx		/* duplicate original word */
153	orl $magic, %ebp		/* (word^magic)|magic */
154
155	addl $1, %ebp			/* (word^magic)|magic == 0xffffffff? */
156	jne L(4)				/* yes => we found word with NUL */
157
158		movl $magic, %esi	/* load magic value */
159		xorl %edx, %ebx		/* clear words which are C */
160
161					movl (%eax), %ecx
162		addl %ebx, %esi		/* (word+magic) */
163
164					movl $magic, %edi
165		jnc L(5)		/* previous addl caused overflow? */
166
167					movl %edi, %ebp
168		xorl %ebx, %esi		/* (word+magic)^word */
169
170					addl %ecx, %ebp
171		orl $magic, %esi	/* ((word+magic)^word)|magic */
172
173		addl $1, %esi		/* ((word+magic)^word)|magic==0xf..f?*/
174		jne L(5)		/* yes => we found word with C */
175
176					xorl %ecx, %ebp
177					addl %ecx, %edi
178
179					leal 4(%eax), %eax
180					jnc L(4)
181
182						movl %ecx, %ebx
183					orl $magic, %ebp
184
185					addl $1, %ebp
186					jne L(4)
187
188						movl $magic, %esi
189						xorl %edx, %ebx
190
191	movl (%eax), %ecx
192						addl %ebx, %esi
193
194	movl $magic, %edi
195						jnc L(5)
196
197	movl %edi, %ebp
198						xorl %ebx, %esi
199
200	addl %ecx, %ebp
201						orl $magic, %esi
202
203						addl $1, %esi
204						jne L(5)
205
206	xorl %ecx, %ebp
207	addl %ecx, %edi
208
209	leal 4(%eax), %eax
210	jnc L(4)
211
212		movl %ecx, %ebx
213	orl $magic, %ebp
214
215	addl $1, %ebp
216	jne L(4)
217
218		movl $magic, %esi
219		xorl %edx, %ebx
220
221					movl (%eax), %ecx
222		addl %ebx, %esi
223
224					movl $magic, %edi
225		jnc L(5)
226
227					movl %edi, %ebp
228		xorl %ebx, %esi
229
230					addl %ecx, %ebp
231		orl $magic, %esi
232
233		addl $1, %esi
234		jne L(5)
235
236					xorl %ecx, %ebp
237					addl %ecx, %edi
238
239					leal 4(%eax), %eax
240					jnc L(4)
241
242						movl %ecx, %ebx
243					orl $magic, %ebp
244
245					addl $1, %ebp
246					jne L(4)
247
248						movl $magic, %esi
249						xorl %edx, %ebx
250
251	movl (%eax), %ecx
252						addl %ebx, %esi
253
254	movl $magic, %edi
255						jnc L(5)
256
257	movl %edi, %ebp
258						xorl %ebx, %esi
259
260	addl %ecx, %ebp
261						orl $magic, %esi
262
263						addl $1, %esi
264
265						je L(1)
266
267	/* We know there is no NUL byte but a C byte in the word.
268	   %ebx contains NUL in this particular byte.  */
269L(5):	subl $4, %eax		/* adjust pointer */
270	testb %bl, %bl		/* first byte == C? */
271
272	jz L(out)		/* yes => return pointer */
273
274	incl %eax		/* increment pointer */
275	testb %bh, %bh		/* second byte == C? */
276
277	jz L(out)		/* yes => return pointer */
278
279	shrl $16, %ebx		/* make upper bytes accessible */
280	incl %eax		/* increment pointer */
281
282	cmp $0, %bl		/* third byte == C */
283	je L(out)		/* yes => return pointer */
284
285	incl %eax		/* increment pointer */
286
287L(out):	popl %ebp		/* restore saved registers */
288	cfi_adjust_cfa_offset (-4)
289	cfi_restore (ebp)
290	popl %ebx
291	cfi_adjust_cfa_offset (-4)
292	cfi_restore (ebx)
293
294	popl %esi
295	cfi_adjust_cfa_offset (-4)
296	cfi_restore (esi)
297	popl %edi
298	cfi_adjust_cfa_offset (-4)
299	cfi_restore (edi)
300
301	ret
302
303	cfi_adjust_cfa_offset (16)
304	cfi_rel_offset (edi, 12)
305	cfi_rel_offset (esi, 8)
306	cfi_rel_offset (ebx, 4)
307	cfi_rel_offset (ebp, 0)
308	/* We know there is a NUL byte in the word.  But we have to test
309	   whether there is an C byte before it in the word.  */
310L(4):	subl $4, %eax		/* adjust pointer */
311	cmpb %dl, %cl		/* first byte == C? */
312
313	je L(out)		/* yes => return pointer */
314
315	cmpb $0, %cl		/* first byte == NUL? */
316	je L(3)			/* yes => return NULL */
317
318	incl %eax		/* increment pointer */
319
320	cmpb %dl, %ch		/* second byte == C? */
321	je L(out)		/* yes => return pointer */
322
323	cmpb $0, %ch		/* second byte == NUL? */
324	je L(3)			/* yes => return NULL */
325
326	shrl $16, %ecx		/* make upper bytes accessible */
327	incl %eax		/* increment pointer */
328
329	cmpb %dl, %cl		/* third byte == C? */
330	je L(out)		/* yes => return pointer */
331
332	cmpb $0, %cl		/* third byte == NUL? */
333	je L(3)			/* yes => return NULL */
334
335	incl %eax		/* increment pointer */
336
337	/* The test four the fourth byte is necessary!  */
338	cmpb %dl, %ch		/* fourth byte == C? */
339	je L(out)		/* yes => return pointer */
340
341L(3):	xorl %eax, %eax
342	jmp L(out)
343END (strchr)
344
345#undef index
346weak_alias (strchr, index)
347libc_hidden_builtin_def (strchr)
348