1/* memchr (str, chr, len) -- Return pointer to first occurrence of CHR in STR
2	 less than LEN.  For Intel 80x86, x>=3.
3   Copyright (C) 1994-2021 Free Software Foundation, Inc.
4   This file is part of the GNU C Library.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, see
18   <https://www.gnu.org/licenses/>.  */
19
20#include <sysdep.h>
21#include "asm-syntax.h"
22
23#define PARMS	4+8		/* space for 2 saved regs */
24#define RTN	PARMS
25#define STR	RTN
26#define CHR	STR+4
27#define LEN	CHR+4
28
29	.text
30ENTRY (__memchr)
31
32	/* Save callee-safe registers used in this function.  */
33	pushl %esi
34	cfi_adjust_cfa_offset (4)
35	pushl %edi
36	cfi_adjust_cfa_offset (4)
37	cfi_rel_offset (edi, 0)
38
39	/* Load parameters into registers.  */
40	movl STR(%esp), %eax	/* str: pointer to memory block.  */
41	movl CHR(%esp), %edx	/* c: byte we are looking for.  */
42	movl LEN(%esp), %esi	/* len: length of memory block.  */
43	cfi_rel_offset (esi, 4)
44
45	/* If my must not test more than three characters test
46	   them one by one.  This is especially true for 0.  */
47	cmpl $4, %esi
48	jb L(3)
49
50	/* At the moment %edx contains CHR.  What we need for the
51	   algorithm is CHR in all bytes of the dword.  Avoid
52	   operations on 16 bit words because these require an
53	   prefix byte (and one more cycle).  */
54	movb %dl, %dh		/* Now it is 0|0|c|c */
55	movl %edx, %ecx
56	shll $16, %edx		/* Now c|c|0|0 */
57	movw %cx, %dx		/* And finally c|c|c|c */
58
59	/* Better performance can be achieved if the word (32
60	   bit) memory access is aligned on a four-byte-boundary.
61	   So process first bytes one by one until boundary is
62	   reached. Don't use a loop for better performance.  */
63
64	testb $3, %al		/* correctly aligned ? */
65	je L(2)			/* yes => begin loop */
66	cmpb %dl, (%eax)	/* compare byte */
67	je L(9)			/* target found => return */
68	incl %eax		/* increment source pointer */
69	decl %esi		/* decrement length counter */
70	je L(4)			/* len==0 => return NULL */
71
72	testb $3, %al		/* correctly aligned ? */
73	je L(2)			/* yes => begin loop */
74	cmpb %dl, (%eax)	/* compare byte */
75	je L(9)			/* target found => return */
76	incl %eax		/* increment source pointer */
77	decl %esi		/* decrement length counter */
78	je L(4)			/* len==0 => return NULL */
79
80	testb $3, %al		/* correctly aligned ? */
81	je L(2)			/* yes => begin loop */
82	cmpb %dl, (%eax)	/* compare byte */
83	je L(9)			/* target found => return */
84	incl %eax		/* increment source pointer */
85	decl %esi		/* decrement length counter */
86	/* no test for len==0 here, because this is done in the
87	   loop head */
88	jmp L(2)
89
90      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
91	 change any of the hole bits of LONGWORD.
92
93	 1) Is this safe?  Will it catch all the zero bytes?
94	 Suppose there is a byte with all zeros.  Any carry bits
95	 propagating from its left will fall into the hole at its
96	 least significant bit and stop.  Since there will be no
97	 carry from its most significant bit, the LSB of the
98	 byte to the left will be unchanged, and the zero will be
99	 detected.
100
101	 2) Is this worthwhile?  Will it ignore everything except
102	 zero bytes?  Suppose every byte of LONGWORD has a bit set
103	 somewhere.  There will be a carry into bit 8.	If bit 8
104	 is set, this will carry into bit 16.  If bit 8 is clear,
105	 one of bits 9-15 must be set, so there will be a carry
106	 into bit 16.  Similarly, there will be a carry into bit
107	 24.  If one of bits 24-31 is set, there will be a carry
108	 into bit 32 (=carry flag), so all of the hole bits will
109	 be changed.
110
111	 3) But wait!  Aren't we looking for CHR, not zero?
112	 Good point.  So what we do is XOR LONGWORD with a longword,
113	 each of whose bytes is CHR.  This turns each byte that is CHR
114	 into a zero.  */
115
116
117	/* Each round the main loop processes 16 bytes.  */
118
119	ALIGN (4)
120
121L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
122	movl $0xfefefeff, %edi	/* magic value */
123	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
124				   are now 0 */
125	addl %ecx, %edi		/* add the magic value to the word.  We get
126				   carry bits reported for each byte which
127				   is *not* 0 */
128
129	/* According to the algorithm we had to reverse the effect of the
130	   XOR first and then test the overflow bits.  But because the
131	   following XOR would destroy the carry flag and it would (in a
132	   representation with more than 32 bits) not alter then last
133	   overflow, we can now test this condition.  If no carry is signaled
134	   no overflow must have occurred in the last byte => it was 0.	*/
135	jnc L(8)
136
137	/* We are only interested in carry bits that change due to the
138	   previous add, so remove original bits */
139	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
140
141	/* Now test for the other three overflow bits.  */
142	orl $0xfefefeff, %edi	/* set all non-carry bits */
143	incl %edi		/* add 1: if one carry bit was *not* set
144				   the addition will not result in 0.  */
145
146	/* If at least one byte of the word is CHR we don't get 0 in %edi.  */
147	jnz L(8)		/* found it => return pointer */
148
149	/* This process is unfolded four times for better performance.
150	   we don't increment the source pointer each time.  Instead we
151	   use offsets and increment by 16 in each run of the loop.  But
152	   before probing for the matching byte we need some extra code
153	   (following LL(13) below).  Even the len can be compared with
154	   constants instead of decrementing each time.  */
155
156	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
157	movl $0xfefefeff, %edi	/* magic value */
158	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
159				   are now 0 */
160	addl %ecx, %edi		/* add the magic value to the word.  We get
161				   carry bits reported for each byte which
162				   is *not* 0 */
163	jnc L(7)		/* highest byte is CHR => return pointer */
164	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
165	orl $0xfefefeff, %edi	/* set all non-carry bits */
166	incl %edi		/* add 1: if one carry bit was *not* set
167				   the addition will not result in 0.  */
168	jnz L(7)		/* found it => return pointer */
169
170	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
171	movl $0xfefefeff, %edi	/* magic value */
172	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
173				   are now 0 */
174	addl %ecx, %edi		/* add the magic value to the word.  We get
175				   carry bits reported for each byte which
176				   is *not* 0 */
177	jnc L(6)		/* highest byte is CHR => return pointer */
178	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
179	orl $0xfefefeff, %edi	/* set all non-carry bits */
180	incl %edi		/* add 1: if one carry bit was *not* set
181				   the addition will not result in 0.  */
182	jnz L(6)		/* found it => return pointer */
183
184	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
185	movl $0xfefefeff, %edi	/* magic value */
186	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
187				   are now 0 */
188	addl %ecx, %edi		/* add the magic value to the word.  We get
189				   carry bits reported for each byte which
190				   is *not* 0 */
191	jnc L(5)		/* highest byte is CHR => return pointer */
192	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
193	orl $0xfefefeff, %edi	/* set all non-carry bits */
194	incl %edi		/* add 1: if one carry bit was *not* set
195				   the addition will not result in 0.  */
196	jnz L(5)		/* found it => return pointer */
197
198	/* Adjust both counters for a full round, i.e. 16 bytes.  */
199	addl $16, %eax
200L(2):	subl $16, %esi
201	jae L(1)		/* Still more than 16 bytes remaining */
202
203	/* Process remaining bytes separately.  */
204	cmpl $4-16, %esi	/* rest < 4 bytes? */
205	jb L(3)			/* yes, than test byte by byte */
206
207	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
208	movl $0xfefefeff, %edi	/* magic value */
209	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
210				   are now 0 */
211	addl %ecx, %edi		/* add the magic value to the word.  We get
212				   carry bits reported for each byte which
213				   is *not* 0 */
214	jnc L(8)		/* highest byte is CHR => return pointer */
215	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
216	orl $0xfefefeff, %edi	/* set all non-carry bits */
217	incl %edi		/* add 1: if one carry bit was *not* set
218				   the addition will not result in 0.  */
219	jne L(8)		/* found it => return pointer */
220	addl $4, %eax		/* adjust source pointer */
221
222	cmpl $8-16, %esi	/* rest < 8 bytes? */
223	jb L(3)			/* yes, than test byte by byte */
224
225	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
226	movl $0xfefefeff, %edi	/* magic value */
227	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
228				   are now 0 */
229	addl %ecx, %edi		/* add the magic value to the word.  We get
230				   carry bits reported for each byte which
231				   is *not* 0 */
232	jnc L(8)		/* highest byte is CHR => return pointer */
233	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
234	orl $0xfefefeff, %edi	/* set all non-carry bits */
235	incl %edi		/* add 1: if one carry bit was *not* set
236				   the addition will not result in 0.  */
237	jne L(8)		/* found it => return pointer */
238	addl $4, %eax		/* adjust source pointer */
239
240	cmpl $12-16, %esi	/* rest < 12 bytes? */
241	jb L(3)			/* yes, than test byte by byte */
242
243	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
244	movl $0xfefefeff, %edi	/* magic value */
245	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
246				   are now 0 */
247	addl %ecx, %edi		/* add the magic value to the word.  We get
248				   carry bits reported for each byte which
249				   is *not* 0 */
250	jnc L(8)		/* highest byte is CHR => return pointer */
251	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
252	orl $0xfefefeff, %edi	/* set all non-carry bits */
253	incl %edi		/* add 1: if one carry bit was *not* set
254				   the addition will not result in 0.  */
255	jne L(8)		/* found it => return pointer */
256	addl $4, %eax		/* adjust source pointer */
257
258	/* Check the remaining bytes one by one.  */
259L(3):	andl $3, %esi		/* mask out uninteresting bytes */
260	jz L(4)			/* no remaining bytes => return NULL */
261
262	cmpb %dl, (%eax)	/* compare byte with CHR */
263	je L(9)			/* equal, than return pointer */
264	incl %eax		/* increment source pointer */
265	decl %esi		/* decrement length */
266	jz L(4)			/* no remaining bytes => return NULL */
267
268	cmpb %dl, (%eax)	/* compare byte with CHR */
269	je L(9)			/* equal, than return pointer */
270	incl %eax		/* increment source pointer */
271	decl %esi		/* decrement length */
272	jz L(4)			/* no remaining bytes => return NULL */
273
274	cmpb %dl, (%eax)	/* compare byte with CHR */
275	je L(9)			/* equal, than return pointer */
276
277L(4):	/* no byte found => return NULL */
278	xorl %eax, %eax
279	jmp L(9)
280
281	/* add missing source pointer increments */
282L(5):	addl $4, %eax
283L(6):	addl $4, %eax
284L(7):	addl $4, %eax
285
286	/* Test for the matching byte in the word.  %ecx contains a NUL
287	   char in the byte which originally was the byte we are looking
288	   at.  */
289L(8):	testb %cl, %cl		/* test first byte in dword */
290	jz L(9)			/* if zero => return pointer */
291	incl %eax		/* increment source pointer */
292
293	testb %ch, %ch		/* test second byte in dword */
294	jz L(9)			/* if zero => return pointer */
295	incl %eax		/* increment source pointer */
296
297	testl $0xff0000, %ecx	/* test third byte in dword */
298	jz L(9)			/* if zero => return pointer */
299	incl %eax		/* increment source pointer */
300
301	/* No further test needed we we know it is one of the four bytes.  */
302L(9):	popl %edi		/* pop saved registers */
303	cfi_adjust_cfa_offset (-4)
304	cfi_restore (edi)
305	popl %esi
306	cfi_adjust_cfa_offset (-4)
307	cfi_restore (esi)
308
309	ret
310END (__memchr)
311
312weak_alias (__memchr, memchr)
313libc_hidden_builtin_def (memchr)
314