1/* memchr - find a character in a memory zone
2
3   Copyright (C) 2015-2021 Free Software Foundation, Inc.
4
5   This file is part of the GNU C Library.
6
7   The GNU C Library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 2.1 of the License, or (at your option) any later version.
11
12   The GNU C Library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   Lesser General Public License for more details.
16
17   You should have received a copy of the GNU Lesser General Public
18   License along with the GNU C Library.  If not, see
19   <https://www.gnu.org/licenses/>.  */
20
21#include <sysdep.h>
22
23/* Assumptions:
24 *
25 * ARMv8-a, AArch64, Advanced SIMD.
26 * MTE compatible.
27 */
28
/* Symbol name used for the routine below.  It defaults to __memchr unless
   MEMCHR has already been defined before this point.  */
29#ifndef MEMCHR
30# define MEMCHR __memchr
31#endif
32
33/* Arguments and results.  */
34#define srcin		x0
35#define chrin		w1
36#define cntin		x2
/* result shares x0 with srcin.  */
37#define result		x0
38
/* Integer scratch registers.
   src     address of the 16-byte aligned chunk currently being examined.
   cntrem  count of bytes remaining to be searched.
   synd    64-bit match syndrome copied out of the vector register vend.
   shift   shift count applied to the syndrome of the first chunk.
   tmp and wtmp are the 64-bit and 32-bit views of the same register.  */
39#define src		x3
40#define cntrem		x4
41#define synd		x5
42#define shift		x6
43#define	tmp		x7
44#define wtmp		w7
45
/* Vector registers.
   vrepchr   chrin replicated into every byte lane.
   qdata and vdata are the same register (q1/v1): the chunk just loaded.
   vhas_chr  per-byte result of comparing the chunk against vrepchr.
   vrepmask  0xf00f replicated into every halfword lane.
   vend and dend are the same register (v4/d4): the pairwise-reduced
             comparison result whose low 64 bits are copied to synd.  */
46#define vrepchr		v0
47#define qdata		q1
48#define vdata		v1
49#define vhas_chr	v2
50#define vrepmask	v3
51#define vend		v4
52#define dend		d4
53
54/*
55   Core algorithm:
56   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
57   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
58   requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
59   odd bytes so that adjacent bytes can be merged. Since the bits in the
60   syndrome reflect the order in which things occur in the original memory
61   area, counting trailing zeros identifies exactly which byte matched.  */
62
/* MEMCHR (srcin, chrin, cntin)

   In:   x0 (srcin)  start of the memory area to search.
         w1 (chrin)  byte value to look for; only its low 8 bits are
                     replicated into the comparison vector.
         x2 (cntin)  number of bytes to examine.
   Out:  x0 (result) address of the first byte equal to chrin among the
                     first cntin bytes, or 0 if there is none.
   Uses: x3-x7, v0-v4 and the condition flags.

   Data is always read as whole 16-byte chunks at 16-byte aligned addresses:
   first the chunk containing srcin, then successive chunks, two per loop
   iteration.  Matches located before srcin are discarded by shifting the
   first syndrome; matches at or beyond srcin + cntin are rejected by the
   length comparisons made before each return.  */
63ENTRY (MEMCHR)
	/* PTR_ARG and SIZE_ARG are not defined in this file.
	   NOTE(review): presumably they come from sysdep.h and normalise the
	   pointer argument in x0 and the size argument in x2 - confirm.  */
64	PTR_ARG (0)
65	SIZE_ARG (2)
	/* src = srcin rounded down to a 16-byte boundary.  */
66	bic	src, srcin, 15
	/* A zero-length search returns before any memory is read.  */
67	cbz	cntin, L(nomatch)
68	ld1	{vdata.16b}, [src]
	/* vrepchr = chrin in all 16 byte lanes.  */
69	dup	vrepchr.16b, chrin
	/* vrepmask = 0xf00f in every halfword lane, i.e. 0x0f in even byte
	   lanes and 0xf0 in odd byte lanes, as the syndrome layout needs.  */
70	mov	wtmp, 0xf00f
71	dup	vrepmask.8h, wtmp
	/* 0xff in every lane equal to chrin, 0x00 elsewhere.  */
72	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* shift = srcin * 4.  The register shift below only uses the low six
	   bits of the count, i.e. 4 * (srcin & 15): four syndrome bits for
	   each byte of the chunk that lies before srcin.  */
73	lsl	shift, srcin, 2
	/* Keep one nibble per byte, then add adjacent bytes so the sixteen
	   nibbles pack into the low 64 bits of vend.  */
74	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
75	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
76	fmov	synd, dend
	/* Drop matches located before srcin.  */
77	lsr	synd, synd, shift
78	cbz	synd, L(start_loop)
79
	/* Match in the first chunk.  rbit followed by clz counts trailing
	   zero bits, so synd >> 2 is the offset of the first match from
	   srcin.  This chunk was loaded with ld1 and the rbit here is not
	   conditional on endianness.  */
80	rbit	synd, synd
81	clz	synd, synd
82	add	result, srcin, synd, lsr 2
	/* Return the address only if that offset is below cntin, else 0.  */
83	cmp	cntin, synd, lsr 2
84	csel	result, result, xzr, hi
85	ret
86
87L(start_loop):
	/* tmp = 16 - (srcin & 15): bytes from srcin to the end of the first
	   chunk.  cntrem = bytes still to search after that chunk; if none
	   are left there is no match.  */
88	sub	tmp, src, srcin
89	add	tmp, tmp, 16
90	subs	cntrem, cntin, tmp
91	b.ls	L(nomatch)
92
93	/* Make sure that it won't overread by a 16-byte chunk */
	/* (cntrem + 15) >> 4 chunks remain to be examined and bit 4 of
	   cntrem + 15 is the parity of that count.  When it is odd, enter
	   the loop at its second load so that the final chunk is always the
	   one loaded in L(loop32_2), where the length check is made.  */
94	add	tmp, cntrem, 15
95	tbnz	tmp, 4, L(loop32_2)
96
97	.p2align 4
98L(loop32):
	/* First half of the iteration: advance src by 16 (pre-index
	   writeback) and load the chunk.  Only an "any match" test is done
	   here: the pairwise maximum folded to 64 bits is non-zero if some
	   lane matched.  There is no length check in this half.  */
99	ldr	qdata, [src, 16]!
100	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
101	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
102	fmov	synd, dend
103	cbnz	synd, L(end)
104
105L(loop32_2):
	/* Second half: advance and load the next chunk, and take 32 bytes
	   (two chunks per full iteration) off cntrem.  */
106	ldr	qdata, [src, 16]!
107	subs	cntrem, cntrem, 32
108	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Count exhausted: this was the last chunk.  The branch uses the
	   flags from subs (cmeq does not alter them) and vhas_chr is already
	   computed for L(end).  */
109	b.ls	L(end)
110	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
111	fmov	synd, dend
112	cbz	synd, L(loop32)
113L(end):
	/* vhas_chr holds the comparison result for the chunk at src.  Build
	   the four-bits-per-byte syndrome to locate the first match, if
	   any.  */
114	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
115	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
116	fmov	synd, dend
	/* cntrem = srcin + cntin - src: bytes of the search range that start
	   at src.  */
117	add	tmp, srcin, cntin
118	sub	cntrem, tmp, src
	/* NOTE(review): the bit reversal is skipped on big-endian builds,
	   presumably because ldr q puts the lowest-addressed byte in the most
	   significant bits there, so clz alone finds the first match -
	   confirm against the architecture manual.  */
119#ifndef __AARCH64EB__
120	rbit	synd, synd
121#endif
	/* Bit index of the first match.  With synd == 0 this is 64, i.e. a
	   byte offset of 16, which the comparison below rejects whenever
	   cntrem <= 16.  */
122	clz	synd, synd
	/* Return the address only if the match offset is below cntrem,
	   otherwise 0.  */
123	cmp	cntrem, synd, lsr 2
124	add	result, src, synd, lsr 2
125	csel	result, result, xzr, hi
126	ret
127
128L(nomatch):
	/* No match within the requested range.  */
129	mov	result, 0
130	ret
131
132END (MEMCHR)
/* NOTE(review): weak_alias and libc_hidden_builtin_def are defined outside
   this file; presumably they publish the routine under the name memchr and
   add a hidden internal alias - confirm in the library headers.  */
133weak_alias (MEMCHR, memchr)
134libc_hidden_builtin_def (memchr)
135