1/* Copyright (C) 2012-2021 Free Software Foundation, Inc.
2
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library.  If not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21/* Assumptions:
22 *
23 * ARMv8-a, AArch64, Advanced SIMD.
24 * MTE compatible.
25 */
26
/* STRLEN is the symbol name given to the routine defined below.  If it is
   already defined when this point is reached that name is kept; otherwise
   it defaults to __strlen.  */
27#ifndef STRLEN
28# define STRLEN __strlen
29#endif
30
/* Register aliases used by the routine below.

   srcin and result both name x0: the incoming string pointer and the
   returned length share one register, so srcin must not be needed any
   more once result has been written.  */
31#define srcin		x0
32#define result		x0
33
/* src   - address of the 16-byte chunk currently being examined.
   synd  - 64-bit syndrome for the current chunk (four bits per byte).
   tmp   - scratch; wtmp is the 32-bit view of the same register (x3/w3).
   shift - bit count used to drop syndrome bits of bytes before srcin.  */
34#define src		x1
35#define	synd		x2
36#define tmp		x3
37#define wtmp		w3
38#define shift		x4
39
/* data/vdata are the q and v views of register 0 (the loaded chunk).
   vhas_nul holds the per-byte "is NUL" comparison result.
   vrepmask holds the repeated 0xf00f nibble-selection mask.
   vend/dend are the v and d views of register 3 (the narrowed result
   that is moved to synd).  */
40#define data		q0
41#define vdata		v0
42#define vhas_nul	v1
43#define vrepmask	v2
44#define vend		v3
45#define dend		d3
46
47/* Core algorithm:
48
49   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
50   per byte. For even bytes, bits 0-3 are set if the relevant byte is NUL.
51   Bits 4-7 must be zero. Bits 4-7 are set likewise for odd bytes so that
52   adjacent bytes can be merged. Since the bits in the syndrome reflect the
53   order in which things occur in the original string, counting trailing
54   zeros identifies exactly which byte is the terminating NUL.  */
55
/* STRLEN: return in x0 the number of bytes that precede the first NUL byte
   of the string whose address is passed in x0.

   Registers written by the instructions below: x0 (result), x1-x4
   (src, synd, tmp, shift) and v0-v3.  None of the visible instructions
   touches the stack or calls another routine.

   Memory is only read in whole 16-byte chunks at 16-byte-aligned
   addresses, beginning with the chunk that contains the first byte of the
   string and ending with the chunk that contains the NUL.  */
56ENTRY (STRLEN)
	/* PTR_ARG is a macro defined outside this file.
	   NOTE(review): presumably it normalizes pointer argument 0 for the
	   target ABI - confirm against its definition.  */
57	PTR_ARG (0)
	/* src = srcin rounded down to a 16-byte boundary.  The chunk loaded
	   from it may therefore start with up to 15 bytes that lie before
	   the string.  */
58	bic	src, srcin, 15
	/* Every 16-bit lane of vrepmask becomes 0xf00f, i.e. 0x0f in the
	   even byte lanes and 0xf0 in the odd byte lanes.  */
59	mov	wtmp, 0xf00f
60	ld1	{vdata.16b}, [src]
61	dup	vrepmask.8h, wtmp
	/* Each byte lane of vhas_nul = 0xff where the data byte is 0,
	   otherwise 0x00.  */
62	cmeq	vhas_nul.16b, vdata.16b, 0
	/* shift = 4 * srcin.  The register-controlled lsr below only uses
	   the low six bits of the count, so the effective shift is
	   4 * (srcin & 15): four syndrome bits for every chunk byte that
	   precedes the start of the string.  */
63	lsl	shift, srcin, 2
	/* Keep the low nibble of even bytes and the high nibble of odd
	   bytes.  The pairwise add then merges each adjacent byte pair into
	   one byte; the two nibbles never overlap, so the add cannot carry
	   and acts as an OR.  With both operands equal, the low 64 bits of
	   vend describe all 16 input bytes.  */
64	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
65	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
66	fmov	synd, dend
	/* Discard the syndrome bits of the bytes before srcin.  */
67	lsr	synd, synd, shift
	/* No NUL at or after srcin in this first chunk: scan further
	   chunks.  */
68	cbz	synd, L(loop)
69
	/* NUL found in the first chunk.  rbit followed by clz counts the
	   trailing zero bits of synd; dividing by four converts that bit
	   count to a byte count measured from srcin, which is the length.
	   NOTE(review): unlike the exit path of the loop, this rbit is not
	   conditional on endianness - presumably because the ld1 above
	   fills the byte lanes in memory order on either endianness;
	   confirm.  */
70	rbit	synd, synd
71	clz	result, synd
72	lsr	result, result, 2
73	ret
74
	/* Align the loop entry to a 32-byte (2^5) boundary.  */
75	.p2align 5
76L(loop):
	/* Pre-indexed load: src is advanced by 16 first, then the 16 bytes
	   at the new src are loaded.  */
77	ldr	data, [src, 16]!
78	cmeq	vhas_nul.16b, vdata.16b, 0
	/* Pairwise maximum of vhas_nul with itself: the low 64 bits are
	   non-zero exactly when at least one of the 16 bytes was NUL.  The
	   precise syndrome is only built once, after the loop has exited.  */
79	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
80	fmov	synd, dend
81	cbz	synd, L(loop)
82
	/* The chunk at src contains a NUL.  Build the four-bits-per-byte
	   syndrome in the same way as for the first chunk.  */
83	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
84	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
	/* result = number of bytes from the start of the string to the
	   start of this chunk.  srcin (same register as result) is dead
	   from here on.  */
85	sub	result, src, srcin
86	fmov	synd, dend
	/* The bit reversal is assembled only when __AARCH64EB__ is not
	   defined.
	   NOTE(review): presumably on big-endian the 128-bit ldr above
	   already leaves the first byte in memory at the most significant
	   end of the syndrome, so clz alone locates it - confirm.  */
87#ifndef __AARCH64EB__
88	rbit	synd, synd
89#endif
	/* tmp = 4 * (index of the first NUL within this chunk); add a
	   quarter of it to the chunk offset to obtain the length.  */
90	clz	tmp, synd
91	add	result, result, tmp, lsr 2
92	ret
93
94END (STRLEN)
/* weak_alias and libc_hidden_builtin_def are macros defined outside this
   file.  NOTE(review): presumably they publish the routine under the name
   strlen - confirm against their definitions.  */
95weak_alias (STRLEN, strlen)
96libc_hidden_builtin_def (strlen)
97