1/* memrchr - find the last occurrence of a byte in a memory block
2
3   Copyright (C) 2015-2021 Free Software Foundation, Inc.
4
5   This file is part of the GNU C Library.
6
7   The GNU C Library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 2.1 of the License, or (at your option) any later version.
11
12   The GNU C Library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   Lesser General Public License for more details.
16
17   You should have received a copy of the GNU Lesser General Public
18   License along with the GNU C Library.  If not, see
19   <https://www.gnu.org/licenses/>.  */
20
21#include <sysdep.h>
22
23/* Assumptions:
24 *
25 * ARMv8-a, AArch64, Advanced SIMD.
26 * MTE compatible.
27 */
28
/* Arguments and results.  */
#define srcin		x0	/* In: base address of the buffer.  */
#define chrin		w1	/* In: byte value to search for.  */
#define cntin		x2	/* In: number of bytes to search.  */
#define result		x0	/* Out: address of last match, or NULL.  */

/* Scratch general-purpose registers.  */
#define src		x3	/* Current 16-byte-aligned chunk address.  */
#define cntrem		x4	/* Bytes remaining to search below src.  */
#define synd		x5	/* Syndrome: 4 bits of match info per byte.  */
#define shift		x6	/* Shift that drops syndrome bits past end.  */
#define	tmp		x7	/* General scratch.  */
#define wtmp		w7	/* 32-bit view of tmp.  */
#define end		x8	/* One past the last byte (srcin + cntin).  */
#define endm1		x9	/* Address of the last byte (end - 1).  */

/* SIMD registers.  */
#define vrepchr		v0	/* chrin replicated into all 16 byte lanes.  */
#define qdata		q1	/* 128-bit view of the loaded chunk.  */
#define vdata		v1	/* Current 16-byte chunk of data.  */
#define vhas_chr	v2	/* Per-byte compare result against vrepchr.  */
#define vrepmask	v3	/* 0xf00f per halfword: 4-bits-per-byte mask.  */
#define vend		v4	/* Merged (128->64 bit) compare result.  */
#define dend		d4	/* 64-bit view of vend, moved to synd.  */
51
52/*
53   Core algorithm:
54   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
55   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
56   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
57   set likewise for odd bytes so that adjacent bytes can be merged. Since the
58   bits in the syndrome reflect the order in which things occur in the original
59   string, counting trailing zeros identifies exactly which byte matched.  */
60
/* void *__memrchr (const void *srcin, int chrin, size_t cntin)

   Return a pointer to the last byte equal to (unsigned char) chrin within
   the cntin bytes starting at srcin, or NULL if there is none.

   In:  x0 = srcin, w1 = chrin, x2 = cntin.
   Out: x0 = address of last match, or 0 (NULL).
   Clobbers x3-x9 and v0-v4; uses no stack.

   The buffer is scanned backwards in 16-byte chunks using only 16-byte
   aligned loads, so no load touches a 16-byte granule outside the buffer
   (MTE-compatible, no overreads past either end).  */
ENTRY (__memrchr)
	PTR_ARG (0)
	SIZE_ARG (2)
	add	end, srcin, cntin		/* end = one past the last byte.  */
	sub	endm1, end, 1			/* endm1 = address of the last byte.  */
	bic	src, endm1, 15			/* src = aligned chunk holding endm1.  */
	cbz	cntin, L(nomatch)		/* Empty buffer: no match.  */
	ld1	{vdata.16b}, [src]		/* Load the (aligned) final chunk.  */
	dup	vrepchr.16b, chrin		/* Replicate the target byte.  */
	mov	wtmp, 0xf00f
	dup	vrepmask.8h, wtmp		/* 0xf00f per halfword keeps bits 0-3
						   of even bytes and 4-7 of odd bytes,
						   so addp folds 16 bytes into 64 bits
						   without losing byte order.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	neg	shift, end, lsl 2		/* shift = -(end * 4) mod 64: number of
						   syndrome bits belonging to bytes at
						   or beyond end.  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
	fmov	synd, dend			/* Syndrome: 4 bits per input byte.  */
	lsl	synd, synd, shift		/* Discard bits for bytes past end.  */
	cbz	synd, L(start_loop)		/* No match in the final chunk.  */

	/* Match in the final chunk: clz/4 is the distance of the last match
	   back from endm1.  */
	clz	synd, synd
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2		/* Match inside [srcin, end)?  */
	csel	result, result, xzr, hi		/* If not, it was before srcin: NULL.  */
	ret

L(start_loop):
	sub	tmp, end, src			/* Bytes covered by the first chunk.  */
	subs	cntrem, cntin, tmp		/* Bytes still left below src.  */
	b.ls	L(nomatch)			/* Nothing left to search: no match.  */

	/* Make sure that it won't overread by a 16-byte chunk */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)		/* Bit 4 of cntrem+15 = parity of the
						   remaining chunk count; with an odd
						   count enter the 2x-unrolled loop at
						   its second half so the final subs
						   lands exactly on the last chunk.  */

	.p2align 4
	/* Backward search loop, unrolled 2x (32 bytes per iteration).  The
	   loop body uses the cheaper umaxp merely to detect "any match";
	   the precise per-byte syndrome is rebuilt at L(end).  */
L(loop32):
	ldr	qdata, [src, -16]!		/* Pre-decrement to the next chunk.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)			/* Match found: compute its position.  */

L(loop32_2):
	ldr	qdata, [src, -16]!
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)				/* Buffer exhausted: fall through to the
						   final (possibly no-match) check.  */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end):
	/* Rebuild the exact 4-bits-per-byte syndrome for the current chunk.  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15			/* tmp = last byte of this chunk.  */
#ifdef __AARCH64EB__
	rbit	synd, synd			/* Big-endian: reverse so the last
						   match ends up in the leading bits.  */
#endif
	clz	synd, synd			/* clz/4 = distance back from tmp.  */
	sub	tmp, tmp, synd, lsr 2		/* tmp = address of the last match.
						   If synd == 0, clz is 64 and tmp
						   becomes src - 1, which is rejected
						   below.  */
	cmp	tmp, srcin			/* Match at or after srcin?  */
	csel	result, tmp, xzr, hs		/* Otherwise NULL; this also handles
						   the exhausted-with-no-match case.  */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)
133