/* strcat(dest, src) -- Append SRC on the end of DEST.
   Optimized for x86-64.
   Copyright (C) 2002-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "asm-syntax.h"

/* Will be removed when new strcpy implementation gets merged.  */

/* char *strcat (char *dest, const char *src)

   Append the NUL-terminated string SRC, terminator included, to the
   end of the NUL-terminated string DEST and return DEST.

   Register use, as read from the code below:
     %rdi  dest on entry; never modified, copied to %rax on return
     %rsi  src on entry; advanced while copying
     %rax  scan pointer into DEST, later the source word being copied
     %rdx  scratch while scanning, later the store pointer into DEST
     %rcx  alignment byte counter / DEST word being examined
     %r8   constant 0xfefefefefefefeff used by the NUL test
     %r9   scratch for the NUL test in the copy loop
   No stack is used and no other register is written.

   NUL test used by both word loops: adding 0xfefefefefefefeff to a
   word makes each non-zero byte carry into the next byte, as long as
   all bytes below it were non-zero too.  The carry out of the top
   byte ends up in CF.  The carries into bytes 1..7 show up in
   ((word + magic) ^ word) at bits 8, 16, ..., 56, which are exactly
   the zero bits of the magic value; OR-ing the magic value in and
   adding 1 therefore gives 0 only when all seven carries happened.
   A missing carry anywhere means the word contains a NUL byte.  */

	.text
ENTRY (strcat)
	movq	%rdi, %rcx	/* Copy dest so its low bits can be tested.  */
	andl	$7, %ecx	/* %ecx = dest & 7; ZF is set when dest is
				   8-byte aligned.  */
	movq	%rdi, %rax	/* %rax = scan pointer.  mov leaves the flags
				   from the andl above untouched.  */
	movq	$0xfefefefefefefeff, %r8	/* Magic value, see above.  */

	/* First step: find the end of DEST.  */
	jz	.Lscan_words	/* Already aligned => go to the word loop.  */

	neg	%ecx		/* %ecx = 8 - (dest & 7), the number of bytes  */
	addl	$8, %ecx	/* up to the next 8-byte boundary.  */

	/* Check the unaligned leading bytes one at a time.  */
.Lscan_head:
	cmpb	$0x0, (%rax)	/* Is this byte NUL?  */
	je	.Lcopy		/* Yes => %rax is the end of DEST.  */
	incq	%rax		/* Next byte.  */
	decl	%ecx
	jnz	.Lscan_head



	/* The scan pointer into DEST is now 8-byte aligned.  Look for the
	   NUL one word at a time; the loop body is unrolled four times.  */
	.p2align 4
.Lscan_words:
	/* First unroll.  */
	movq	(%rax), %rcx	/* Load the next 8 bytes of DEST.  */
	addq	$8, %rax	/* Advance now; .Lscan_locate undoes this.  */
	movq	%r8, %rdx	/* Magic value.  */
	addq	%rcx, %rdx	/* word + magic; CF = carry out of the top
				   byte.  */
	jnc	.Lscan_locate	/* No carry => the word contains a NUL.  */
	xorq	%rcx, %rdx	/* (word + magic) ^ word  */
	orq	%r8, %rdx	/* Set every bit that is not a carry bit.  */
	incq	%rdx		/* Result is 0 only if every carry bit was
				   set.  */
	jnz	.Lscan_locate	/* A carry is missing => NUL in this word.  */

	/* Second unroll.  */
	movq	(%rax), %rcx	/* Load the next 8 bytes of DEST.  */
	addq	$8, %rax	/* Advance the scan pointer.  */
	movq	%r8, %rdx	/* Magic value.  */
	addq	%rcx, %rdx	/* word + magic; CF = carry out of the top
				   byte.  */
	jnc	.Lscan_locate	/* No carry => the word contains a NUL.  */
	xorq	%rcx, %rdx	/* (word + magic) ^ word  */
	orq	%r8, %rdx	/* Set every bit that is not a carry bit.  */
	incq	%rdx		/* Result is 0 only if every carry bit was
				   set.  */
	jnz	.Lscan_locate	/* A carry is missing => NUL in this word.  */

	/* Third unroll.  */
	movq	(%rax), %rcx	/* Load the next 8 bytes of DEST.  */
	addq	$8, %rax	/* Advance the scan pointer.  */
	movq	%r8, %rdx	/* Magic value.  */
	addq	%rcx, %rdx	/* word + magic; CF = carry out of the top
				   byte.  */
	jnc	.Lscan_locate	/* No carry => the word contains a NUL.  */
	xorq	%rcx, %rdx	/* (word + magic) ^ word  */
	orq	%r8, %rdx	/* Set every bit that is not a carry bit.  */
	incq	%rdx		/* Result is 0 only if every carry bit was
				   set.  */
	jnz	.Lscan_locate	/* A carry is missing => NUL in this word.  */

	/* Fourth unroll.  */
	movq	(%rax), %rcx	/* Load the next 8 bytes of DEST.  */
	addq	$8, %rax	/* Advance the scan pointer.  */
	movq	%r8, %rdx	/* Magic value.  */
	addq	%rcx, %rdx	/* word + magic; CF = carry out of the top
				   byte.  */
	jnc	.Lscan_locate	/* No carry => the word contains a NUL.  */
	xorq	%rcx, %rdx	/* (word + magic) ^ word  */
	orq	%r8, %rdx	/* Set every bit that is not a carry bit.  */
	incq	%rdx		/* Result is 0 only if every carry bit was
				   set.  */
	jz	.Lscan_words	/* No NUL in this word => keep scanning.
				   Otherwise fall through.  */

	/* %rcx holds a word of DEST that contains a NUL and %rax points
	   just past that word.  Find the exact byte, lowest address
	   first.  */
	.p2align 4		/* Align, it's a jump target.  */
.Lscan_locate:
	subq	$8, %rax	/* Undo the early increment: %rax = address
				   of byte 0 of the word.  */

	testb	%cl, %cl	/* Is byte 0 NUL?  */
	jz	.Lcopy		/* Yes => %rax points at it.  */
	incq	%rax

	testb	%ch, %ch	/* Is byte 1 NUL?  */
	jz	.Lcopy
	incq	%rax

	testl	$0x00ff0000, %ecx /* Is byte 2 NUL?  */
	jz	.Lcopy
	incq	%rax

	testl	$0xff000000, %ecx /* Is byte 3 NUL?  */
	jz	.Lcopy
	incq	%rax

	shrq	$32, %rcx	/* Bring bytes 4..7 down into %ecx.  */

	testb	%cl, %cl	/* Is byte 4 NUL?  */
	jz	.Lcopy
	incq	%rax

	testb	%ch, %ch	/* Is byte 5 NUL?  */
	jz	.Lcopy
	incq	%rax

	testl	$0xff0000, %ecx	/* Is byte 6 NUL?  */
	jz	.Lcopy
	incq	%rax		/* Bytes 0..6 are all non-zero, so the NUL
				   is byte 7, which %rax now addresses.  */

.Lcopy:
	/* Second step: copy SRC to the end of DEST.  %rax points at the
	   terminating NUL of DEST.  */

	movq	%rsi, %rcx	/* Copy src so its low bits can be tested.  */
	andl	$7, %ecx	/* %ecx = src & 7; ZF is set when src is
				   8-byte aligned.  */
	movq	%rax, %rdx	/* %rdx = store pointer; mov leaves the flags
				   from the andl above untouched.  */
	jz	.Lcopy_words	/* Already aligned => go to the word loop.  */

	neg	%ecx		/* %ecx = 8 - (src & 7), the number of bytes  */
	addl	$8, %ecx	/* up to the next 8-byte boundary.  */

	/* Copy single bytes until the source pointer is aligned.  */
.Lcopy_head:
	movb	(%rsi), %al	/* Fetch a byte.  */
	testb	%al, %al	/* Is it NUL?  */
	movb	%al, (%rdx)	/* Store it either way; the store does not
				   change the flags set by testb.  */
	jz	.Ldone		/* The terminator has been copied, done.  */
	incq	%rsi
	incq	%rdx
	decl	%ecx
	jnz	.Lcopy_head

	/* Now the source is aligned.  Unfortunately we cannot force both
	   source and destination to be aligned, so the alignment of the
	   destination is ignored and the stores may be unaligned.  The
	   loop body is unrolled four times.  */
	.p2align 4
.Lcopy_words:
	/* 1st unroll.  */
	movq	(%rsi), %rax	/* Read 8 bytes of SRC.  */
	addq	$8, %rsi	/* Advance the source pointer.  */
	movq	%rax, %r9	/* Work on a copy for the NUL test.  */
	addq	%r8, %r9	/* word + magic; CF = carry out of the top
				   byte.  */
	jnc	.Lcopy_tail	/* No carry => the word contains a NUL.  */
	xorq	%rax, %r9	/* (word + magic) ^ word  */
	orq	%r8, %r9	/* Set every bit that is not a carry bit.  */
	incq	%r9		/* Result is 0 only if every carry bit was
				   set.  */

	jnz	.Lcopy_tail	/* A carry is missing => NUL in this word.  */

	movq	%rax, (%rdx)	/* No NUL: store all 8 bytes.  */
	addq	$8, %rdx	/* Advance the store pointer.  */

	/* 2nd unroll.  */
	movq	(%rsi), %rax	/* Read 8 bytes of SRC.  */
	addq	$8, %rsi	/* Advance the source pointer.  */
	movq	%rax, %r9	/* Work on a copy for the NUL test.  */
	addq	%r8, %r9	/* word + magic; CF = carry out of the top
				   byte.  */
	jnc	.Lcopy_tail	/* No carry => the word contains a NUL.  */
	xorq	%rax, %r9	/* (word + magic) ^ word  */
	orq	%r8, %r9	/* Set every bit that is not a carry bit.  */
	incq	%r9		/* Result is 0 only if every carry bit was
				   set.  */

	jnz	.Lcopy_tail	/* A carry is missing => NUL in this word.  */

	movq	%rax, (%rdx)	/* No NUL: store all 8 bytes.  */
	addq	$8, %rdx	/* Advance the store pointer.  */

	/* 3rd unroll.  */
	movq	(%rsi), %rax	/* Read 8 bytes of SRC.  */
	addq	$8, %rsi	/* Advance the source pointer.  */
	movq	%rax, %r9	/* Work on a copy for the NUL test.  */
	addq	%r8, %r9	/* word + magic; CF = carry out of the top
				   byte.  */
	jnc	.Lcopy_tail	/* No carry => the word contains a NUL.  */
	xorq	%rax, %r9	/* (word + magic) ^ word  */
	orq	%r8, %r9	/* Set every bit that is not a carry bit.  */
	incq	%r9		/* Result is 0 only if every carry bit was
				   set.  */

	jnz	.Lcopy_tail	/* A carry is missing => NUL in this word.  */

	movq	%rax, (%rdx)	/* No NUL: store all 8 bytes.  */
	addq	$8, %rdx	/* Advance the store pointer.  */

	/* 4th unroll.  */
	movq	(%rsi), %rax	/* Read 8 bytes of SRC.  */
	addq	$8, %rsi	/* Advance the source pointer.  */
	movq	%rax, %r9	/* Work on a copy for the NUL test.  */
	addq	%r8, %r9	/* word + magic; CF = carry out of the top
				   byte.  */
	jnc	.Lcopy_tail	/* No carry => the word contains a NUL.  */
	xorq	%rax, %r9	/* (word + magic) ^ word  */
	orq	%r8, %r9	/* Set every bit that is not a carry bit.  */
	incq	%r9		/* Result is 0 only if every carry bit was
				   set.  */

	jnz	.Lcopy_tail	/* A carry is missing => NUL in this word.  */

	movq	%rax, (%rdx)	/* No NUL: store all 8 bytes.  */
	addq	$8, %rdx	/* Advance the store pointer.  */
	jmp	.Lcopy_words	/* Next iteration.  */

	/* Do the last few bytes.  %rax contains the source word that holds
	   the NUL; copy it byte by byte, lowest byte first, up to and
	   including the NUL.  The loop is unrolled twice.  */
	.p2align 4
.Lcopy_tail:
	movb	%al, (%rdx)	/* 1st byte.  */
	testb	%al, %al	/* Is it NUL?  */
	jz	.Ldone		/* Yes, finish.  */
	incq	%rdx		/* Increment destination.  */
	movb	%ah, (%rdx)	/* 2nd byte.  */
	testb	%ah, %ah	/* Is it NUL?  */
	jz	.Ldone		/* Yes, finish.  */
	incq	%rdx		/* Increment destination.  */
	shrq	$16, %rax	/* Drop the two bytes just written...  */
	jmp	.Lcopy_tail	/* ...and look at the next two in %rax.  */


.Ldone:
	movq	%rdi, %rax	/* The original dest is the return value.  */
	retq
END (strcat)
libc_hidden_builtin_def (strcat)