1/* Optimized strlen implementation for PowerPC.
2   Copyright (C) 1997-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21/* The algorithm here uses the following techniques:
22
23   1) Given a word 'x', we can test to see if it contains any 0 bytes
24      by subtracting 0x01010101, and seeing if any of the high bits of each
25      byte changed from 0 to 1. This works because the least significant
26      0 byte must have had no incoming carry (otherwise it's not the least
27      significant), so it is 0x00 - 0x01 == 0xff. For all other
28      byte values, either they have the high bit set initially, or when
29      1 is subtracted you get a value in the range 0x00-0x7f, none of which
30      have their high bit set. The expression here is
31      (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
32      there were no 0x00 bytes in the word.  You get 0x80 in bytes that
33      match, but possibly false 0x80 matches in the next more significant
34      byte to a true match due to carries.  For little-endian this is
35      of no consequence since the least significant match is the one
36      we're interested in, but big-endian needs method 2 to find which
37      byte matches.
38
39   2) Given a word 'x', we can test to see _which_ byte was zero by
40      calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
41      This produces 0x80 in each byte that was zero, and 0x00 in all
42      the other bytes. The '| 0x7f7f7f7f' clears the low 7 bits in each
43      byte, and the '| x' part ensures that bytes with the high bit set
44      produce 0x00. The addition will carry into the high bit of each byte
45      iff that byte had one of its low 7 bits set. We can then just see
46      which was the most significant bit set and divide by 8 to find how
47      many to add to the index.
48      This is from the book 'The PowerPC Compiler Writer's Guide',
49      by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
50
51   We deal with strings not aligned to a word boundary by taking the
52   first word and ensuring that bytes not part of the string
53   are treated as nonzero. To allow for memory latency, we unroll the
54   loop a few times, being careful to ensure that we do not read ahead
55   across cache line boundaries.
56
57   Questions to answer:
58   1) How long are strings passed to strlen? If they're often really long,
59   we should probably use cache management instructions and/or unroll the
60   loop more. If they're often quite short, it might be better to use
61   fact (2) in the inner loop than have to recalculate it.
62   2) How popular are bytes with the high bit set? If they are very rare,
63   on some processors it might be useful to use the simpler expression
64   ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one
65   ALU), but this fails when any character has its high bit set.  */
66
67/* Some notes on register usage: Under the SVR4 ABI, we can use registers
68   0 and 3 through 12 (so long as we don't call any procedures) without
69   saving them. We can also use registers 14 through 31 if we save them.
70   We can't use r1 (it's the stack pointer), r2 nor r13 because the user
71   program may expect them to hold their usual value if we get sent
72   a signal. Integer parameters are passed in r3 through r10.
73   We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving
74   them, the others we must save.  */
75
76/* int [r3] strlen (char *s [r3])  */
77
78ENTRY (strlen)
79
80#define rTMP4	r0
81#define rRTN	r3	/* incoming STR arg, outgoing result */
82#define rSTR	r4	/* current string position */
83#define rPADN	r5	/* number of padding bits we prepend to the
84			   string to make it start at a word boundary */
85#define rFEFE	r6	/* constant 0xfefefeff (-0x01010101) */
86#define r7F7F	r7	/* constant 0x7f7f7f7f */
87#define rWORD1	r8	/* current string word */
88#define rWORD2	r9	/* next string word */
89#define rMASK	r9	/* mask for first string word */
90#define rTMP1	r10
91#define rTMP2	r11
92#define rTMP3	r12
93
94
95	clrrwi	rSTR, rRTN, 2
96	lis	r7F7F, 0x7f7f
97	rlwinm	rPADN, rRTN, 3, 27, 28
98	lwz	rWORD1, 0(rSTR)
99	li	rMASK, -1
100	addi	r7F7F, r7F7F, 0x7f7f
101/* We use method (2) on the first two words, because rFEFE isn't
102   required which reduces setup overhead.  Also gives a faster return
103   for small strings on big-endian due to needing to recalculate with
104   method (2) anyway.  */
105#ifdef __LITTLE_ENDIAN__
106	slw	rMASK, rMASK, rPADN
107#else
108	srw	rMASK, rMASK, rPADN
109#endif
110	and	rTMP1, r7F7F, rWORD1
111	or	rTMP2, r7F7F, rWORD1
112	add	rTMP1, rTMP1, r7F7F
113	nor	rTMP3, rTMP2, rTMP1
114	and.	rTMP3, rTMP3, rMASK
115	mtcrf	0x01, rRTN
116	bne	L(done0)
117	lis	rFEFE, -0x101
118	addi	rFEFE, rFEFE, -0x101
119/* Are we now aligned to a doubleword boundary?  */
120	bt	29, L(loop)
121
122/* Handle second word of pair.  */
123/* Perhaps use method (1) here for little-endian, saving one instruction?  */
124	lwzu	rWORD1, 4(rSTR)
125	and	rTMP1, r7F7F, rWORD1
126	or	rTMP2, r7F7F, rWORD1
127	add	rTMP1, rTMP1, r7F7F
128	nor.	rTMP3, rTMP2, rTMP1
129	bne	L(done0)
130
131/* The loop.  */
132
133L(loop):
134	lwz	rWORD1, 4(rSTR)
135	lwzu	rWORD2, 8(rSTR)
136	add	rTMP1, rFEFE, rWORD1
137	nor	rTMP2, r7F7F, rWORD1
138	and.	rTMP1, rTMP1, rTMP2
139	add	rTMP3, rFEFE, rWORD2
140	nor	rTMP4, r7F7F, rWORD2
141	bne	L(done1)
142	and.	rTMP3, rTMP3, rTMP4
143	beq	L(loop)
144
145#ifndef __LITTLE_ENDIAN__
146	and	rTMP1, r7F7F, rWORD2
147	add	rTMP1, rTMP1, r7F7F
148	andc	rTMP3, rTMP4, rTMP1
149	b	L(done0)
150
151L(done1):
152	and	rTMP1, r7F7F, rWORD1
153	subi	rSTR, rSTR, 4
154	add	rTMP1, rTMP1, r7F7F
155	andc	rTMP3, rTMP2, rTMP1
156
157/* When we get to here, rSTR points to the first word in the string that
158   contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
159   and 0x00 otherwise.  */
160L(done0):
161	cntlzw	rTMP3, rTMP3
162	subf	rTMP1, rRTN, rSTR
163	srwi	rTMP3, rTMP3, 3
164	add	rRTN, rTMP1, rTMP3
165	blr
166#else
167
168L(done0):
169	addi	rTMP1, rTMP3, -1	/* Form a mask from trailing zeros.  */
170	andc	rTMP1, rTMP1, rTMP3
171	cntlzw	rTMP1, rTMP1		/* Count bits not in the mask.  */
172	subf	rTMP3, rRTN, rSTR
173	subfic	rTMP1, rTMP1, 32-7
174	srwi	rTMP1, rTMP1, 3
175	add	rRTN, rTMP1, rTMP3
176	blr
177
178L(done1):
179	addi	rTMP3, rTMP1, -1
180	andc	rTMP3, rTMP3, rTMP1
181	cntlzw	rTMP3, rTMP3
182	subf	rTMP1, rRTN, rSTR
183	subfic	rTMP3, rTMP3, 32-7-32
184	srawi	rTMP3, rTMP3, 3
185	add	rRTN, rTMP1, rTMP3
186	blr
187#endif
188
189END (strlen)
190libc_hidden_builtin_def (strlen)
191