/* strlen with SSE2
   Copyright (C) 2010-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/* For strlen, only the SHARED build uses this optimized version; for
   strcat, strncat and strnlen, both the STATIC and SHARED builds use it.  */
20
/* Build guard: the body is assembled for the strnlen and strcat variants
   in every build, for plain strlen only when SHARED is defined, and in all
   cases only when IS_IN (libc) holds.  */
#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)

/* When USE_AS_STRCAT is defined this file contributes only the scanning
   code: the include, the PARMS/STR/RETURN/STRLEN definitions, ENTRY, the
   load of the string pointer and the closing RETURN/END are all skipped.
   The including file is then presumably expected to arrive here with
   %edx = string pointer and with RETURN already defined (the includer is
   not visible from this file -- confirm there).  */
# ifndef USE_AS_STRCAT

#  include <sysdep.h>
/* Offset from %esp of the first stack argument at function entry.  */
#  define PARMS	4
#  define STR	PARMS
#  define RETURN	ret

#  ifdef USE_AS_STRNLEN
/* LEN is only ever read after %edi has been pushed, so its offset is
   PARMS + 4 (second argument slot) + 4 (saved %edi).  */
#   define LEN	PARMS + 8
/* Unwind annotations that accompany a 4-byte push/pop of REG.  */
#   define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#   define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#   define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
#   define POP(REG)	popl	REG;	CFI_POP (REG)
/* strnlen keeps its byte budget in %edi, so every return restores it.
   RETURN is used in the middle of the function; the CFI_PUSH after the
   POP puts the unwind state back to "%edi is on the stack" for the code
   that follows the ret in the file.  */
#   undef RETURN
#   define RETURN	POP (%edi); CFI_PUSH(%edi); ret
#  endif

#  ifndef STRLEN
#   define STRLEN	__strlen_sse2
#  endif

/* STRLEN -- length of a NUL-terminated byte string (SSE2, i386).

   In:   4(%esp) at entry = string pointer.
         With USE_AS_STRNLEN, 8(%esp) at entry = maximum length.
   Out:  %eax = index of the first NUL byte; with USE_AS_STRNLEN the
         maximum length is returned instead when no NUL lies below it.
   Uses: %ecx, %edx, %xmm0-%xmm3, %xmm6 and the flags are overwritten;
         %edi is saved and restored in the USE_AS_STRNLEN build.

   Register roles:
     %edx  string pointer during the byte-wise prolog, afterwards the
           pmovmskb result (one bit per byte of a 16-byte block).
     %eax  0 during the prolog (so L(exit_tailN) returns N), afterwards
           the address cursor, finally the result.
     %ecx  string pointer + 16; subtracted from the cursor in L(exit) /
           L(strnlen_exit) to turn an address into an index.
     %edi  (strnlen only) bytes of budget left, measured from the
           position about to be examined.  */
	atom_text_section
ENTRY (STRLEN)
	mov	STR(%esp), %edx
#  ifdef USE_AS_STRNLEN
	PUSH	(%edi)
	movl	LEN(%esp), %edi
	/* Budget of at most 4 bytes: use the bounded prolog.  */
	sub	$4, %edi
	jbe	L(len_less4_prolog)
#  endif
# endif
	/* Prolog: test bytes 0..15 one at a time.  %eax stays 0 so that the
	   shared L(exit_tailN) stubs yield exactly N.  */
	xor	%eax, %eax
	cmpb	$0, (%edx)
	jz	L(exit_tail0)
	cmpb	$0, 1(%edx)
	jz	L(exit_tail1)
	cmpb	$0, 2(%edx)
	jz	L(exit_tail2)
	cmpb	$0, 3(%edx)
	jz	L(exit_tail3)

# ifdef USE_AS_STRNLEN
	/* At most 4 more bytes allowed: finish in the bounded prolog.  */
	sub	$4, %edi
	jbe	L(len_less8_prolog)
# endif

	cmpb	$0, 4(%edx)
	jz	L(exit_tail4)
	cmpb	$0, 5(%edx)
	jz	L(exit_tail5)
	cmpb	$0, 6(%edx)
	jz	L(exit_tail6)
	cmpb	$0, 7(%edx)
	jz	L(exit_tail7)

# ifdef USE_AS_STRNLEN
	sub	$4, %edi
	jbe	L(len_less12_prolog)
# endif

	cmpb	$0, 8(%edx)
	jz	L(exit_tail8)
	cmpb	$0, 9(%edx)
	jz	L(exit_tail9)
	cmpb	$0, 10(%edx)
	jz	L(exit_tail10)
	cmpb	$0, 11(%edx)
	jz	L(exit_tail11)

# ifdef USE_AS_STRNLEN
	sub	$4, %edi
	jbe	L(len_less16_prolog)
# endif

	cmpb	$0, 12(%edx)
	jz	L(exit_tail12)
	cmpb	$0, 13(%edx)
	jz	L(exit_tail13)
	cmpb	$0, 14(%edx)
	jz	L(exit_tail14)
	cmpb	$0, 15(%edx)
	jz	L(exit_tail15)

	/* Bytes 0..15 are non-zero.  Continue with 16-byte blocks:
	   %ecx = s + 16 (index bias), %eax = s + 16 rounded down to a
	   16-byte boundary.  Bytes of that first block that lie below
	   s + 16 were already checked above, so re-reading them is
	   harmless.  */
	pxor	%xmm0, %xmm0
	lea	16(%edx), %eax
	mov	%eax, %ecx
	and	$-16, %eax

# ifdef USE_AS_STRNLEN
	/* Rounding down moved the cursor back by (s & 15) bytes; add the
	   same amount to the budget so it is counted from %eax.  Then
	   require more than 64 bytes before running an unchecked group of
	   four blocks.  */
	and	$15, %edx
	add	%edx, %edi
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	/* Unrolled scan, 16 bytes per step.  Each pcmpeqb turns the zeroed
	   register into a byte mask of NUL positions; pmovmskb collects it
	   in %edx.  %eax is advanced before the branch, so on a hit it
	   points 16 bytes past the matching block.  A register whose
	   compare found nothing is all zero again and is reused as the
	   next zero operand.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	/* Sixteen blocks (256 bytes) scanned without a hit.  Switch to the
	   64-byte loop, which needs %eax on a 64-byte boundary.  */
# ifdef USE_AS_STRNLEN
	/* The rounding below moves the cursor back by (%eax & 63) bytes;
	   give those bytes back to the budget.  */
	mov	%eax, %edx
	and	$63, %edx
	add	%edx, %edi
# endif

	and	$-0x40, %eax

	.p2align 4
L(aligned_64_loop):
# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif
	/* Fold four blocks with unsigned byte-minimum: the result has a
	   zero byte exactly where at least one of the four blocks has one.
	   %xmm3 is all zero here (cleared above, and every compare into it
	   that fell through found no match).  */
	movaps	(%eax), %xmm0
	movaps	16(%eax), %xmm1
	movaps	32(%eax), %xmm2
	movaps	48(%eax), %xmm6
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqb	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	64(%eax), %eax
	jz	L(aligned_64_loop)

	/* A NUL is somewhere in the 64 bytes ending at %eax.  Retest the
	   four blocks in order.  %ecx is adjusted so that %eax - %ecx in
	   L(exit) equals (address of the tested block) - s:
	   block 0 -> s + 64, block 1 -> s + 48, block 2 -> s + 32,
	   block 3 -> s + 16.  Blocks 0 and 2 are re-read from memory
	   because %xmm0 and %xmm2 were overwritten above; %xmm1 and %xmm6
	   still hold blocks 1 and 3.  */
	pcmpeqb	-64(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	48(%ecx), %ecx
	jnz	L(exit)

	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	pcmpeqb	-32(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	/* By elimination the NUL is in block 3; no test needed.  */
	pcmpeqb	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx
	/* Decode the mask.  After the subtraction %eax is the index of the
	   first byte of the matching block; bit k of %edx is set when byte
	   k of that block is NUL.  The lowest set bit is located by
	   splitting into bytes (%dl / %dh), then nibbles, then single bits;
	   the last candidate of each nibble is implied by elimination.  */
L(exit):
	sub	%ecx, %eax
	test	%dl, %dl
	jz	L(exit_high)

	mov	%dl, %cl
	and	$15, %cl
	jz	L(exit_8)
	test	$0x01, %dl
	jnz	L(exit_tail0)
	test	$0x02, %dl
	jnz	L(exit_tail1)
	test	$0x04, %dl
	jnz	L(exit_tail2)
	add	$3, %eax
	RETURN

	.p2align 4
L(exit_8):
	test	$0x10, %dl
	jnz	L(exit_tail4)
	test	$0x20, %dl
	jnz	L(exit_tail5)
	test	$0x40, %dl
	jnz	L(exit_tail6)
	add	$7, %eax
	RETURN

	.p2align 4
L(exit_high):
	mov	%dh, %ch
	and	$15, %ch
	jz	L(exit_high_8)
	test	$0x01, %dh
	jnz	L(exit_tail8)
	test	$0x02, %dh
	jnz	L(exit_tail9)
	test	$0x04, %dh
	jnz	L(exit_tail10)
	add	$11, %eax
	RETURN

	.p2align 4
L(exit_high_8):
	test	$0x10, %dh
	jnz	L(exit_tail12)
	test	$0x20, %dh
	jnz	L(exit_tail13)
	test	$0x40, %dh
	jnz	L(exit_tail14)
	add	$15, %eax
	/* Return %eax as it stands.  */
L(exit_tail0):
	RETURN

# ifdef USE_AS_STRNLEN

	/* strnlen tail: at most 64 bytes of budget remain from %eax.  Undo
	   the "sub $64" that brought us here, then examine up to four
	   blocks, stopping as soon as the budget is used up.  %edi is at
	   least 1 whenever a block is tested: every earlier "sub/jbe" pair
	   fell through with a positive remainder.  */
	.p2align 4
L(len_less64):
	pxor	%xmm0, %xmm0
	add	$64, %edi

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	sub	$16, %edi
	jbe	L(return_start_len)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	sub	$16, %edi
	jbe	L(return_start_len)

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	sub	$16, %edi
	jbe	L(return_start_len)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	/* No NUL within the budget: the result is the maximum length.  */
	movl	LEN(%esp), %eax
	RETURN

	/* Same mask decoding as L(exit), but a NUL at byte k of the block
	   only counts when the budget covers it (%edi >= k + 1); otherwise
	   the maximum length is returned.  Byte 0 needs no check because
	   %edi >= 1 here, so it shares L(exit_tail0).  */
	.p2align 4
L(strnlen_exit):
	sub	%ecx, %eax

	test	%dl, %dl
	jz	L(strnlen_exit_high)
	mov	%dl, %cl
	and	$15, %cl
	jz	L(strnlen_exit_8)
	test	$0x01, %dl
	jnz	L(exit_tail0)
	test	$0x02, %dl
	jnz	L(strnlen_exit_tail1)
	test	$0x04, %dl
	jnz	L(strnlen_exit_tail2)
	sub	$4, %edi
	jb	L(return_start_len)
	lea	3(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_8):
	test	$0x10, %dl
	jnz	L(strnlen_exit_tail4)
	test	$0x20, %dl
	jnz	L(strnlen_exit_tail5)
	test	$0x40, %dl
	jnz	L(strnlen_exit_tail6)
	sub	$8, %edi
	jb	L(return_start_len)
	lea	7(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_high):
	mov	%dh, %ch
	and	$15, %ch
	jz	L(strnlen_exit_high_8)
	test	$0x01, %dh
	jnz	L(strnlen_exit_tail8)
	test	$0x02, %dh
	jnz	L(strnlen_exit_tail9)
	test	$0x04, %dh
	jnz	L(strnlen_exit_tail10)
	sub	$12, %edi
	jb	L(return_start_len)
	lea	11(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_high_8):
	test	$0x10, %dh
	jnz	L(strnlen_exit_tail12)
	test	$0x20, %dh
	jnz	L(strnlen_exit_tail13)
	test	$0x40, %dh
	jnz	L(strnlen_exit_tail14)
	sub	$16, %edi
	jb	L(return_start_len)
	lea	15(%eax), %eax
	RETURN

	/* L(strnlen_exit_tailK): NUL found at byte K of the block.  Return
	   %eax + K when the budget reaches that byte, else the maximum
	   length.  */
	.p2align 4
L(strnlen_exit_tail1):
	sub	$2, %edi
	jb	L(return_start_len)
	lea	1(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail2):
	sub	$3, %edi
	jb	L(return_start_len)
	lea	2(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail4):
	sub	$5, %edi
	jb	L(return_start_len)
	lea	4(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail5):
	sub	$6, %edi
	jb	L(return_start_len)
	lea	5(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail6):
	sub	$7, %edi
	jb	L(return_start_len)
	lea	6(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail8):
	sub	$9, %edi
	jb	L(return_start_len)
	lea	8(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail9):
	sub	$10, %edi
	jb	L(return_start_len)
	lea	9(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail10):
	sub	$11, %edi
	jb	L(return_start_len)
	lea	10(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail12):
	sub	$13, %edi
	jb	L(return_start_len)
	lea	12(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail13):
	sub	$14, %edi
	jb	L(return_start_len)
	lea	13(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail14):
	sub	$15, %edi
	jb	L(return_start_len)
	lea	14(%eax), %eax
	RETURN

	/* Budget exhausted before any NUL: reload and return the maximum
	   length from the stack.  */
	.p2align 4
L(return_start_len):
	movl	LEN(%esp), %eax
	RETURN

/* Bounded prologs (strnlen only).  Each is entered when the budget for
   the current group of four bytes is at most 4; "add $4" restores %edi to
   the number of bytes still allowed (the maximum length itself in
   L(len_less4_prolog), 1..4 in the others).  After every non-NUL byte the
   remaining count is compared so the scan stops exactly at the limit.  */

	.p2align 4
L(len_less4_prolog):
	/* The branch here skipped the prolog's "xor", so clear %eax now.  */
	xor	%eax, %eax

	/* Maximum length 0: return 0 without touching the string.  */
	add	$4, %edi
	jz	L(exit_tail0)

	cmpb	$0, (%edx)
	jz	L(exit_tail0)
	cmp	$1, %edi
	je	L(exit_tail1)

	cmpb	$0, 1(%edx)
	jz	L(exit_tail1)
	cmp	$2, %edi
	je	L(exit_tail2)

	cmpb	$0, 2(%edx)
	jz	L(exit_tail2)
	cmp	$3, %edi
	je	L(exit_tail3)

	cmpb	$0, 3(%edx)
	jz	L(exit_tail3)
	mov	$4, %eax
	RETURN

	.p2align 4
L(len_less8_prolog):
	add	$4, %edi

	cmpb	$0, 4(%edx)
	jz	L(exit_tail4)
	cmp	$1, %edi
	je	L(exit_tail5)

	cmpb	$0, 5(%edx)
	jz	L(exit_tail5)
	cmp	$2, %edi
	je	L(exit_tail6)

	cmpb	$0, 6(%edx)
	jz	L(exit_tail6)
	cmp	$3, %edi
	je	L(exit_tail7)

	cmpb	$0, 7(%edx)
	jz	L(exit_tail7)
	mov	$8, %eax
	RETURN


	.p2align 4
L(len_less12_prolog):
	add	$4, %edi

	cmpb	$0, 8(%edx)
	jz	L(exit_tail8)
	cmp	$1, %edi
	je	L(exit_tail9)

	cmpb	$0, 9(%edx)
	jz	L(exit_tail9)
	cmp	$2, %edi
	je	L(exit_tail10)

	cmpb	$0, 10(%edx)
	jz	L(exit_tail10)
	cmp	$3, %edi
	je	L(exit_tail11)

	cmpb	$0, 11(%edx)
	jz	L(exit_tail11)
	mov	$12, %eax
	RETURN

	.p2align 4
L(len_less16_prolog):
	add	$4, %edi

	cmpb	$0, 12(%edx)
	jz	L(exit_tail12)
	cmp	$1, %edi
	je	L(exit_tail13)

	cmpb	$0, 13(%edx)
	jz	L(exit_tail13)
	cmp	$2, %edi
	je	L(exit_tail14)

	cmpb	$0, 14(%edx)
	jz	L(exit_tail14)
	cmp	$3, %edi
	je	L(exit_tail15)

	cmpb	$0, 15(%edx)
	jz	L(exit_tail15)
	mov	$16, %eax
	RETURN
# endif

	/* L(exit_tailN): return %eax + N.  Reached with %eax = 0 from the
	   prologs and with %eax = index of the matching block from
	   L(exit) / L(strnlen_exit).  */
	.p2align 4
L(exit_tail1):
	add	$1, %eax
	RETURN

L(exit_tail2):
	add	$2, %eax
	RETURN

L(exit_tail3):
	add	$3, %eax
	RETURN

L(exit_tail4):
	add	$4, %eax
	RETURN

L(exit_tail5):
	add	$5, %eax
	RETURN

L(exit_tail6):
	add	$6, %eax
	RETURN

L(exit_tail7):
	add	$7, %eax
	RETURN

L(exit_tail8):
	add	$8, %eax
	RETURN

L(exit_tail9):
	add	$9, %eax
	RETURN

L(exit_tail10):
	add	$10, %eax
	RETURN

L(exit_tail11):
	add	$11, %eax
	RETURN

L(exit_tail12):
	add	$12, %eax
	RETURN

L(exit_tail13):
	add	$13, %eax
	RETURN

L(exit_tail14):
	add	$14, %eax
	RETURN

L(exit_tail15):
	add	$15, %eax
	/* With USE_AS_STRCAT there is no RETURN here: execution falls
	   through into whatever the including file places after this
	   body.  */
# ifndef USE_AS_STRCAT
	RETURN
END (STRLEN)
# endif
#endif