/* Pentium optimized __mpn_lshift --
   Copyright (C) 1992-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include "sysdep.h"
#include "asm-syntax.h"

#define PARMS	4+16		/* space for 4 saved regs */
#define RES	PARMS
#define S	RES+4
#define SIZE	S+4
#define CNT	SIZE+4

	.text
ENTRY (__mpn_lshift)

	pushl	%edi
	cfi_adjust_cfa_offset (4)
	pushl	%esi
	cfi_adjust_cfa_offset (4)
	pushl	%ebp
	cfi_adjust_cfa_offset (4)
	cfi_rel_offset (ebp, 0)
	pushl	%ebx
	cfi_adjust_cfa_offset (4)

	movl	RES(%esp),%edi
	cfi_rel_offset (edi, 12)
	movl	S(%esp),%esi
	cfi_rel_offset (esi, 8)
	movl	SIZE(%esp),%ebx
	cfi_rel_offset (ebx, 0)
	movl	CNT(%esp),%ecx

/* We can use faster code for shift-by-1, provided the operands do not
   overlap in a way that the low-to-high loop would clobber.  */
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%esi),%eax
	cmpl	%edi,%eax
	jnc	L(special)		/* jump if s_ptr + 1 >= res_ptr */
	leal	(%esi,%ebx,4),%eax
	cmpl	%eax,%edi
	jnc	L(special)		/* jump if res_ptr >= s_ptr + size */
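
/* The shift-by-1 path walks upward from the least significant limb,
   reading s_ptr[i+1] just before overwriting res_ptr[i].  The two
   checks above admit it only when that order cannot clobber unread
   source limbs: either res_ptr <= s_ptr + 1 (destination at most one
   limb above the source) or res_ptr >= s_ptr + size (no overlap at
   all).  The normal path walks downward instead, which is safe for
   the usual res_ptr >= s_ptr overlap.  */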

L(normal):
	leal	-4(%edi,%ebx,4),%edi
	leal	-4(%esi,%ebx,4),%esi

	movl	(%esi),%edx
	subl	$4,%esi
	xorl	%eax,%eax
	shldl	%cl,%edx,%eax		/* compute carry limb: the cnt high
					   bits of the most significant limb */
	pushl	%eax			/* push carry limb onto stack */
	cfi_adjust_cfa_offset (4)

	decl	%ebx
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	shrl	$3,%ebx			/* number of 8-limb blocks */
	jz	L(end)
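
/* Unrolling bookkeeping: after the carry limb, size - 1 limbs remain;
   %ebx/8 of those go through the 8-limb unrolled loop below, and the
   remaining (size - 1) % 8 are handled one at a time in L(oop2).
   E.g. size = 21 leaves 20 limbs: two unrolled passes plus a 4-limb
   tail.  */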

	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN	(2)
L(oop):	movl	-28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	shldl	%cl,%eax,%ebp
	shldl	%cl,%edx,%eax
	movl	%ebp,(%edi)
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebp
	movl	-12(%esi),%eax
	shldl	%cl,%ebp,%edx
	shldl	%cl,%eax,%ebp
	movl	%edx,-8(%edi)
	movl	%ebp,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebp
	shldl	%cl,%edx,%eax
	shldl	%cl,%ebp,%edx
	movl	%eax,-16(%edi)
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	shldl	%cl,%eax,%ebp
	shldl	%cl,%edx,%eax
	movl	%ebp,-24(%edi)
	movl	%eax,-28(%edi)

	subl	$32,%esi
	subl	$32,%edi
	decl	%ebx
	jnz	L(oop)
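
/* Each shldl %cl,SRC,DST above computes
     DST = (DST << cnt) | (SRC >> (32 - cnt)),
   so by keeping the next lower limb loaded in the paired register the
   loop reads each source limb from memory only once: its low bits end
   up in its own result limb and its high bits spill into the result
   limb above it.  */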

L(end):	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	andl	$7,%ebx			/* (size - 1) % 8 leftover limbs */
	jz	L(end2)
L(oop2):
	movl	(%esi),%eax
	shldl	%cl,%eax,%edx
	movl	%edx,(%edi)
	movl	%eax,%edx
	subl	$4,%esi
	subl	$4,%edi
	decl	%ebx
	jnz	L(oop2)

L(end2):
	shll	%cl,%edx		/* compute least significant limb */
	movl	%edx,(%edi)		/* store it */

	popl	%eax			/* pop carry limb */
	cfi_adjust_cfa_offset (-4)

	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
	popl	%ebp
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebp)
	popl	%esi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (esi)
	popl	%edi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (edi)

	ret

/* The code below loops from the least significant end of the arrays,
   which is only permissible when the operands don't overlap in a way
   that would clobber unread source limbs; the dispatch at the top of
   the function has already checked for that.  (The function is
   documented to work for overlapping source and destination, hence
   the normal path above, which loops from the most significant
   end.)  */

	cfi_adjust_cfa_offset (16)
	cfi_rel_offset (edi, 12)
	cfi_rel_offset (esi, 8)
	cfi_rel_offset (ebp, 4)
	cfi_rel_offset (ebx, 0)
L(special):
	movl	(%esi),%edx
	addl	$4,%esi

	decl	%ebx
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	shrl	$3,%ebx

	addl	%edx,%edx		/* shift first limb by 1; CF = bit out */
	incl	%ebx
	decl	%ebx			/* test %ebx for zero; inc/dec leave CF intact */
	jz	L(Lend)
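
/* Shifting left by one bit is the same as adding a number to itself,
   so adcl %reg,%reg below both doubles a limb and pulls in, via the
   carry flag, the bit shifted out of the limb beneath it.  On the
   original Pentium these add-with-carry instructions are considerably
   cheaper than shldl, which is the point of this special case.  */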

	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN	(2)
L(Loop):
	movl	28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	4(%esi),%edx
	adcl	%eax,%eax
	movl	%ebp,(%edi)
	adcl	%edx,%edx
	movl	%eax,4(%edi)

	movl	8(%esi),%ebp
	movl	12(%esi),%eax
	adcl	%ebp,%ebp
	movl	%edx,8(%edi)
	adcl	%eax,%eax
	movl	%ebp,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebp
	adcl	%edx,%edx
	movl	%eax,16(%edi)
	adcl	%ebp,%ebp
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	adcl	%eax,%eax
	movl	%ebp,24(%edi)
	adcl	%edx,%edx
	movl	%eax,28(%edi)

	leal	32(%esi),%esi		/* use leal not to clobber carry */
	leal	32(%edi),%edi
	decl	%ebx
	jnz	L(Loop)

L(Lend):
	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	sbbl	%eax,%eax		/* save carry in %eax */
	andl	$7,%ebx
	jz	L(Lend2)
	addl	%eax,%eax		/* restore carry from eax */
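/* Carry save/restore: sbbl %eax,%eax turns the carry flag into 0 or -1
   in %eax before the flag-clobbering andl; addl %eax,%eax then
   regenerates CF from bit 31, so the adc chain resumes with the right
   carry in both the L(Loop2) and L(Lend2) continuations.  */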
L(Loop2):
	movl	%edx,%ebp
	movl	(%esi),%edx
	adcl	%edx,%edx
	movl	%ebp,(%edi)

	leal	4(%esi),%esi		/* use leal not to clobber carry */
	leal	4(%edi),%edi
	decl	%ebx
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		/* restore carry from eax */
L(L1):	movl	%edx,(%edi)		/* store last limb */

	sbbl	%eax,%eax		/* %eax = -CF */
	negl	%eax			/* return carry limb, 0 or 1 */

	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
	popl	%ebp
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebp)
	popl	%esi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (esi)
	popl	%edi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (edi)

	ret
END (__mpn_lshift)