/* Optimized strncmp implementation for PowerPC64/POWER8.
   Copyright (C) 2015-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* STRNCMP is the symbol name given to the routine below.  If it is
   already defined when this point is reached that name is kept;
   otherwise the routine is emitted as plain strncmp.  */
#ifndef STRNCMP
# define STRNCMP strncmp
#endif

/* Implements the function

   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)

   The implementation uses unaligned doubleword accesses to avoid specialized
   code paths depending on data alignment.  Although recent powerpc64 uses
   64K as the default page size, the page cross handling assumes a minimum
   page size of 4K.  */

34	.machine  power8
35ENTRY_TOCLESS (STRNCMP, 4)
36	/* Check if size is 0.  */
37	mr.	r10,r5
38	beq	cr0,L(ret0)
39
40	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
41	   the code:
42
43	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
44
45	   with PAGE_SIZE being 4096 and ITER_SIZE begin 16.  */
46	rldicl	r8,r3,0,52
47	cmpldi	cr7,r8,4096-16
48	bgt	cr7,L(pagecross)
49	rldicl	r9,r4,0,52
50	cmpldi	cr7,r9,4096-16
51	bgt	cr7,L(pagecross)
52
53	/* For short string up to 16 bytes, load both s1 and s2 using
54	   unaligned dwords and compare.  */
55	ld	r7,0(r3)
56	ld	r9,0(r4)
57	li	r8,0
58	cmpb	r8,r7,r8
59	cmpb	r6,r7,r9
60	orc.	r8,r8,r6
61	bne	cr0,L(different1)
62
63	/* If the string compared are equal, but size is less or equal
64	   to 8, return 0.  */
65	cmpldi	cr7,r10,8
66	li	r9,0
67	ble	cr7,L(ret1)
68	addi	r5,r10,-8
69
70	ld	r7,8(r3)
71	ld	r9,8(r4)
72	cmpb	r8,r7,r8
73	cmpb	r6,r7,r9
74	orc.	r8,r8,r6
75	bne	cr0,L(different0)
76
77	cmpldi	cr7,r5,8
78	mr	r9,r8
79	ble	cr7,L(ret1)
80
81	/* Update pointers and size.  */
82	addi	r10,r10,-16
83	addi	r3,r3,16
84	addi	r4,r4,16
85
86	/* Now it has checked for first 16 bytes, align source1 to doubleword
87	   and adjust source2 address.  */
88L(align_8b):
89	rldicl	r5,r3,0,61
90	rldicr	r3,r3,0,60
91	subf	r4,r5,r4
92	add	r10,r10,r5
93
94	/* At this point, source1 alignment is 0 and source2 alignment is
95	   between 0 and 7.  Check is source2 alignment is 0, meaning both
96	   sources have the same alignment.  */
97	andi.	r8,r4,0x7
98	beq	cr0,L(loop_eq_align_0)
99
100	li	r5,0
101	b	L(loop_ne_align_1)
102
103	/* If source2 is unaligned to doubleword, the code needs to check
104	   on each interation if the unaligned doubleword access will cross
105	   a 4k page boundary.  */
106	.align 4
107L(loop_ne_align_0):
108	ld	r7,0(r3)
109	ld	r9,0(r4)
110	cmpb	r8,r7,r5
111	cmpb	r6,r7,r9
112	orc.	r8,r8,r6
113	bne	cr0,L(different1)
114
115	cmpldi	cr7,r10,8
116	ble	cr7,L(ret0)
117	addi	r10,r10,-8
118	addi	r3,r3,8
119	addi	r4,r4,8
120L(loop_ne_align_1):
121	rldicl	r9,r4,0,52
122	cmpldi	r7,r9,4088
123	ble	cr7,L(loop_ne_align_0)
124	cmpdi	cr7,r10,0
125	beq	cr7,L(ret0)
126
127	lbz	r9,0(r3)
128	lbz	r8,0(r4)
129	cmplw	cr7,r9,r8
130	bne	cr7,L(byte_ne_4)
131	cmpdi	cr7,r9,0
132	beq	cr7,L(size_reached_0)
133
134	li	r9,r7
135	addi	r8,r3,1
136	mtctr	r9
137	addi	r4,r4,1
138	addi	r10,r10,-1
139	addi	r3,r3,8
140
141	/* The unaligned read of source2 will cross a 4K page boundary,
142	   and the different byte or NULL maybe be in the remaining page
143	   bytes.  Since it can not use the unaligned load the algorithm
144	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
145	.align 4
146L(loop_ne_align_byte):
147	cmpdi	cr7,r10,0
148	addi	r10,r10,-1
149	beq	cr7,L(ret0)
150	lbz	r9,0(r8)
151	lbz	r7,0(r4)
152	addi	r8,r8,1
153	addi	r4,r4,1
154	cmplw	cr7,r9,r7
155	cmpdi	cr5,r9,0
156	bne	cr7,L(size_reached_2)
157	beq	cr5,L(size_reached_0)
158	bdnz	L(loop_ne_align_byte)
159
160	cmpdi	cr7,r10,0
161	bne+	cr7,L(loop_ne_align_0)
162
163	.align 4
164L(ret0):
165	li	r9,0
166L(ret1):
167	mr	r3,r9
168	blr
169
170	/* The code now check if r8 and r10 are different by issuing a
171	   cmpb and shift the result based on its output:
172
173	#ifdef __LITTLE_ENDIAN__
174	  leadzero = (__builtin_ffsl (z1) - 1);
175	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
176	  r1 = (r1 >> leadzero) & 0xFFUL;
177	  r2 = (r2 >> leadzero) & 0xFFUL;
178	#else
179	  leadzero = __builtin_clzl (z1);
180	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
181	  r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
182	  r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
183	#endif
184	  return r1 - r2;  */
185
186	.align 4
187L(different0):
188	mr	r10,r5
189#ifdef __LITTLE_ENDIAN__
190L(different1):
191        neg	r11,r8
192        sldi	r10,r10,3
193        and	r8,r11,r8
194        addi	r10,r10,-8
195        cntlzd	r8,r8
196        subfic	r8,r8,63
197        extsw 	r8,r8
198        cmpld	cr7,r8,r10
199        ble	cr7,L(different2)
200        mr	r8,r10
201L(different2):
202        extsw	r8,r8
203#else
204L(different1):
205	addi	r10,r10,-1
206	cntlzd	r8,r8
207	sldi	r10,r10,3
208	cmpld	cr7,r8,r10
209	blt	cr7,L(different2)
210	mr	r8,r10
211L(different2):
212	subfic	r8,r8,56
213#endif
214	srd	r7,r7,r8
215	srd	r9,r9,r8
216	rldicl	r3,r7,0,56
217	rldicl	r9,r9,0,56
218	subf	r9,r9,3
219	extsw	r9,r9
220	mr	r3,r9
221	blr
222
223	/* If unaligned 16 bytes reads across a 4K page boundary, it uses
224	   a simple byte a byte comparison until the page alignment for s1
225	   is reached.  */
226	.align 4
227L(pagecross):
228	lbz	r7,0(r3)
229	lbz	r9,0(r4)
230	subfic	r8,r8,4095
231	cmplw	cr7,r9,r7
232	bne	cr7,L(byte_ne_3)
233	cmpdi	cr7,r9,0
234	beq	cr7,L(byte_ne_0)
235	addi	r10,r10,-1
236	subf	r7,r8,r10
237	subf	r9,r7,r10
238	addi	r9,r9,1
239	mtctr	r9
240	b	L(pagecross_loop1)
241
242	.align 4
243L(pagecross_loop0):
244	beq	cr7,L(ret0)
245	lbz	r9,0(r3)
246	lbz	r8,0(r4)
247	addi	r10,r10,-1
248	cmplw	cr7,r9,r8
249	cmpdi	cr5,r9,0
250	bne	r7,L(byte_ne_2)
251	beq	r5,L(byte_ne_0)
252L(pagecross_loop1):
253	cmpdi	cr7,r10,0
254	addi	r3,r3,1
255	addi	r4,r4,1
256	bdnz	L(pagecross_loop0)
257	cmpdi	cr7,r7,0
258	li	r9,0
259	bne+	cr7,L(align_8b)
260	b	L(ret1)
261
262	/* If both source1 and source2 are doubleword aligned, there is no
263	   need for page boundary cross checks.  */
264	.align 4
265L(loop_eq_align_0):
266	ld	r7,0(r3)
267	ld	r9,0(r4)
268	cmpb	r8,r7,r8
269	cmpb	r6,r7,r9
270	orc.	r8,r8,r6
271	bne	cr0,L(different1)
272
273	cmpldi	cr7,r10,8
274	ble	cr7,L(ret0)
275	addi	r9,r10,-9
276
277	li	r5,0
278	srdi	r9,r9,3
279	addi	r9,r9,1
280	mtctr	r9
281	b	L(loop_eq_align_2)
282
283	.align 4
284L(loop_eq_align_1):
285	bdz	L(ret0)
286L(loop_eq_align_2):
287	ldu	r7,8(r3)
288	addi	r10,r10,-8
289	ldu	r9,8(r4)
290	cmpb	r8,r7,r5
291	cmpb	r6,r7,r9
292	orc.	r8,r8,r6
293	beq	cr0,L(loop_eq_align_1)
294	b	L(different1)
295
296	.align 4
297L(byte_ne_0):
298	li	r7,0
299L(byte_ne_1):
300	subf	r9,r9,r7
301	extsw	r9,r9
302	b	L(ret1)
303
304	.align 4
305L(byte_ne_2):
306	extsw	r7,r9
307	mr	r9,r8
308	b	L(byte_ne_1)
309L(size_reached_0):
310	li	r10,0
311L(size_reached_1):
312	subf	r9,r9,r10
313	extsw	r9,r9
314	b	L(ret1)
315L(size_reached_2):
316	extsw	r10,r9
317	mr	r9,r7
318	b	L(size_reached_1)
319L(byte_ne_3):
320	extsw	r7,r7
321	b	L(byte_ne_1)
322L(byte_ne_4):
323	extsw	r10,r9
324	mr	r9,r8
325	b	L(size_reached_1)
326END(STRNCMP)
327libc_hidden_builtin_def(strncmp)
328