/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
   Copyright (C) 2012-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <arm-features.h>
#include <sysdep.h>

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
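
/* Illustrative only: a rough C model of the per-word "syndrome" that
   the UADD8/EOR/SEL sequences below compute (the function name and
   this sketch are not part of the build, they are just exposition):

	#include <stdint.h>

	// Per byte: 0xff where data1 holds a NUL, otherwise data1 ^ data2.
	// A non-zero syndrome therefore means "difference or end of
	// string" somewhere in this word.
	static uint32_t
	syndrome_of (uint32_t data1, uint32_t data2)
	{
	  uint32_t syndrome = 0;
	  for (int i = 0; i < 4; i++)
	    {
	      uint32_t b1 = (data1 >> (8 * i)) & 0xff;
	      uint32_t b2 = (data2 >> (8 * i)) & 0xff;
	      uint32_t s = (b1 == 0) ? 0xff : (b1 ^ b2);
	      syndrome |= s << (8 * i);
	    }
	  return syndrome;
	}

   In the assembly, UADD8 of data1 with 0xffffffff sets the GE[i] flag
   exactly when byte i of data1 is non-zero (0xff + b carries iff
   b != 0); SEL then picks the EOR byte where GE is set and 0xff where
   it is clear.  */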

/* Build Options:
   STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
   string.  If comparing completely random strings the pre-check will
   save time, since there is a very high probability of a mismatch in
   the first character: we save significant overhead if this is the
   common case.  However, if strings are likely to be identical (e.g.
   because we're verifying a hit in a hash table), then this check
   is largely redundant.  */

#define STRCMP_PRECHECK	1
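
/* A rough C equivalent of the pre-check enabled above (illustrative
   only; these variable names are invented here):

	const unsigned char c1 = *(const unsigned char *) s1;
	const unsigned char c2 = *(const unsigned char *) s2;
	if (c1 == 0 || c1 != c2)
	  return c1 - c2;

   i.e. return immediately whenever the first pair of bytes already
   decides the result.  */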

	.syntax unified

#ifdef __ARM_BIG_ENDIAN
# define S2LO lsl
# define S2LOEQ lsleq
# define S2HI lsr
# define MSB 0x000000ff
# define LSB 0xff000000
# define BYTE0_OFFSET 24
# define BYTE1_OFFSET 16
# define BYTE2_OFFSET 8
# define BYTE3_OFFSET 0
#else /* not  __ARM_BIG_ENDIAN */
# define S2LO lsr
# define S2LOEQ lsreq
# define S2HI lsl
# define BYTE0_OFFSET 0
# define BYTE1_OFFSET 8
# define BYTE2_OFFSET 16
# define BYTE3_OFFSET 24
# define MSB 0xff000000
# define LSB 0x000000ff
#endif /* not  __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	.thumb

/* In Thumb code we can't use MVN with a register shift, but we do have ORN.  */
.macro prepare_mask mask_reg, nbits_reg
	S2HI \mask_reg, const_m1, \nbits_reg
.endm
.macro apply_mask data_reg, mask_reg
	orn \data_reg, \data_reg, \mask_reg
.endm
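
/* What the two macros above compute, roughly, in C (illustrative
   only; little-endian shown, where S2HI is LSL):

	uint32_t mask = 0xffffffffu << nbits;	// prepare_mask
	data |= ~mask;				// apply_mask (ORN)

   i.e. the nbits/8 low-order bytes of the loaded word are forced to
   0xff.  On big-endian the shift direction flips (S2HI is LSR), so
   the same job is done on the other end of the word.  Bytes forced to
   0xff can neither look like a terminating NUL nor produce a spurious
   difference between the two strings.  */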

	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	lsl	\d1, \d1, tmp1
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, r1

	bx	lr
#endif
	.endm
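
/* Illustrative only: a C model of what strcmp_epilogue_aligned works
   out for the little-endian case (names invented for exposition):

	#include <stdint.h>

	static int
	aligned_epilogue (uint32_t syndrome, uint32_t data1, uint32_t data2)
	{
	  // Position of the first (lowest-addressed) non-zero syndrome
	  // byte, i.e. the first byte pair that differs or terminates.
	  int shift = __builtin_ctz (syndrome) & ~7;
	  return (int) ((data1 >> shift) & 0xff)
		 - (int) ((data2 >> shift) & 0xff);
	}

   The assembly reaches the same byte with REV + CLZ, rounded down to
   a byte boundary with BIC, before extracting and subtracting.  */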

	.text
	.p2align	5
.Lstrcmp_start_addr:
#if STRCMP_PRECHECK == 1
.Lfastpath_exit:
	sub	r0, r2, r3
	bx	lr
	nop
#endif
ENTRY (strcmp)
#if STRCMP_PRECHECK == 1
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	strd	r4, r5, [sp, #-16]!
	cfi_def_cfa_offset (16)
	cfi_offset (r4, -16)
	cfi_offset (r5, -12)
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	cfi_offset (r6, -8)
	cfi_offset (r7, -4)
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, .Lloop_aligned8

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
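	/* In effect (C-style sketch, little-endian view, names invented
	   here):

		uintptr_t off = (uintptr_t) src1 & 7;    // same for src2 here
		uint64_t junk = ~(~0ULL << (8 * off));   // bytes before the start
		w1 = *(const uint64_t *) ((uintptr_t) src1 & ~(uintptr_t) 7) | junk;
		w2 = *(const uint64_t *) ((uintptr_t) src2 & ~(uintptr_t) 7) | junk;

	   The bytes that precede the real start of the strings are forced
	   to 0xff in both loads, so they always compare equal and are
	   never taken for a NUL terminator.  */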
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	prepare_mask tmp1, tmp2
	apply_mask data1a, tmp1
	apply_mask data2a, tmp1
	beq	.Lstart_realigned8
	apply_mask data1b, tmp1
	mov	data1a, const_m1
	apply_mask data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only needed if s_a == 0.  */
	beq	.Lloop_aligned8

.Ldiff_found:
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	prepare_mask tmp1, tmp1
	apply_mask data1, tmp1
	apply_mask data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#if STRCMP_PRECHECK == 0
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else  /* STRCMP_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	mov	result, tmp1
	ldr	r4, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	bx	lr

#if STRCMP_PRECHECK == 1
.Laligned_m1:
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
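	/* Conceptually (little-endian, names invented here), every
	   unaligned word of src2 straddles two aligned words and could be
	   reassembled as

		uint32_t w2 = (prev >> (8 * ofs)) | (next << (8 * (4 - ofs)));

	   where ofs = src2 & 3 and prev/next are consecutive aligned loads
	   from src2.  The .Loverlap* loops below never build w2 explicitly;
	   they compare the low 4 - ofs bytes of each src1 word against the
	   useful bytes of prev, and the remaining ofs bytes against the
	   start of next, in two steps.  */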
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	/* R6/7 Not used in this sequence.  */
	cfi_restore (r6)
	cfi_restore (r7)
	neg	result, result
	bx	lr

6:
	cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	/* R6/7 not used in this sequence.  */
	cfi_restore (r6)
	cfi_restore (r7)
	bx	lr

.Lstrcmp_tail:
	cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	/* R6/7 not used in this sequence.  */
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, data2, lsr #24
	bx	lr
END (strcmp)
libc_hidden_builtin_def (strcmp)