/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "hardware/regs/sio.h"
#include "hardware/regs/addressmap.h"

.syntax unified
.cpu cortex-m0plus
.thumb

#include "pico/asm_helper.S"

#ifndef PICO_DIVIDER_CALL_IDIV0
#define PICO_DIVIDER_CALL_IDIV0 1
#endif

#ifndef PICO_DIVIDER_CALL_LDIV0
#define PICO_DIVIDER_CALL_LDIV0 1
#endif
.macro div_section name
#if PICO_DIVIDER_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

#if SIO_DIV_CSR_READY_LSB == 0
.equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
#else
#error SIO_DIV_CSR_READY_SHIFT_FOR_CARRY above needs updating for the new value of SIO_DIV_CSR_READY_LSB
#endif
#if SIO_DIV_CSR_DIRTY_LSB == 1
.equ SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY, 2
#else
#error SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY above needs updating for the new value of SIO_DIV_CSR_DIRTY_LSB
#endif

@ the hardware divider takes 8 cycles; wait 8-n cycles, where n is the number of
@ cycles the caller's own instructions spend before reading the results
.macro wait_div n
.rept (8-\n) / 2
  b 9f
9:
.endr
.if (8-\n) % 2
 nop
.endif
.endm
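
/*
 * For reference, a minimal C sketch of the access pattern this file implements
 * (sio_hw and the register/field names are assumed from hardware/structs/sio.h
 * and hardware/regs/sio.h):
 *
 *   static uint32_t hw_udiv32(uint32_t y, uint32_t x) {
 *       sio_hw->div_udividend = y;                         // writing an operand starts a calculation
 *       sio_hw->div_udivisor = x;
 *       while (!(sio_hw->div_csr & SIO_DIV_CSR_READY_BITS))
 *           ;                                              // results are valid 8 cycles after the writes
 *       uint32_t r = sio_hw->div_remainder;                // remainder first...
 *       uint32_t q = sio_hw->div_quotient;                 // ...QUOTIENT must be read last (it clears DIRTY)
 *       (void) r;
 *       return q;
 *   }
 *
 * The wait_div macro replaces the busy-wait with a fixed delay, since the
 * delay from writing the operands to the results being ready is always 8 cycles.
 */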


#if (SIO_DIV_SDIVISOR_OFFSET != SIO_DIV_SDIVIDEND_OFFSET + 4) || (SIO_DIV_QUOTIENT_OFFSET != SIO_DIV_SDIVISOR_OFFSET + 4) || (SIO_DIV_REMAINDER_OFFSET != SIO_DIV_QUOTIENT_OFFSET + 4)
#error register layout has changed - we rely on this order to make sure we save/restore in the right order
#endif

# expects the SIO_BASE pointer in r2
.macro save_div_state_and_lr
1:
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    # wait for the current result, as we can't save the signed-ness of the operation in flight
    lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
    bcc 1b
    push {r4, r5, r6, r7, lr}
    // note we must read the quotient last, and since it isn't the last register, we can't use ldmia!
    ldr r4, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    ldr r5, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    ldr r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
.endm

.macro restore_div_state_and_return
    // writing sdividend (r4), sdivisor (r5), remainder (r7), quotient (r6) in that order
    //
    // it is worth considering what happens if we are interrupted
    //
    // after writing r4: we are DIRTY and !READY
    //    ... an interruptor using the divider will complete based on incorrect inputs, but at least the dividend
    //        will be saved/restored correctly, and we'll restore the rest ourselves
    // after writing r4, r5: we are DIRTY and !READY
    //    ... an interruptor using the divider will complete based on possibly wrongly signed inputs, but at least
    //        the dividend and divisor will be saved/restored correctly, and we'll restore the rest ourselves
    // after writing r4, r5, r7: we are DIRTY and READY
    //    ... an interruptor using the divider will see the dividend, divisor and remainder registers as is
    //        (what we just restored ourselves), and we'll restore the quotient after the fact

    // note we don't use stm; not because it could be restarted by an interrupt (which would be harmless),
    // but because this is single-cycle IO space, so 4 separate stores are cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    pop {r4, r5, r6, r7, pc}
.endm
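
/*
 * The save/restore pair above is the assembly analogue of what the SDK's
 * hardware_divider library exposes in C as hw_divider_save_state() /
 * hw_divider_restore_state(); a sketch of the idea:
 *
 *   typedef struct { uint32_t values[4]; } div_state_t;    // dividend, divisor, remainder, quotient
 *
 *   static void div_save(div_state_t *s) {
 *       while (!(sio_hw->div_csr & SIO_DIV_CSR_READY_BITS))
 *           ;                                   // a division in flight can't be snapshotted
 *       s->values[0] = sio_hw->div_udividend;
 *       s->values[1] = sio_hw->div_udivisor;
 *       s->values[2] = sio_hw->div_remainder;
 *       s->values[3] = sio_hw->div_quotient;    // quotient read last, as above
 *   }
 *
 *   static void div_restore(const div_state_t *s) {
 *       sio_hw->div_udividend = s->values[0];
 *       sio_hw->div_udivisor  = s->values[1];
 *       sio_hw->div_remainder = s->values[2];
 *       sio_hw->div_quotient  = s->values[3];
 *   }
 */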

.macro save_div_state_and_lr_64
    push {r4, r5, r6, r7, lr}
    ldr r6, =SIO_BASE
1:
    ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
    # wait for the current result, as we can't save the signed-ness of the operation in flight
    lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
    bcc 1b
    // note we must read the quotient last, and since it isn't the last register, we can't use ldmia!
    ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
    ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
    ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
.endm

.macro restore_div_state_and_return_64
    // writing udividend (r4), udivisor (r5), remainder (r7), quotient (r6) in that order
    //
    // it is worth considering what happens if we are interrupted
    //
    // after writing r4: we are DIRTY and !READY
    //    ... an interruptor using the divider will complete based on incorrect inputs, but at least the dividend
    //        will be saved/restored correctly, and we'll restore the rest ourselves
    // after writing r4, r5: we are DIRTY and !READY
    //    ... an interruptor using the divider will complete based on possibly wrongly signed inputs, but at least
    //        the dividend and divisor will be saved/restored correctly, and we'll restore the rest ourselves
    // after writing r4, r5, r7: we are DIRTY and READY
    //    ... an interruptor using the divider will see the dividend, divisor and remainder registers as is
    //        (what we just restored ourselves), and we'll restore the quotient after the fact

    mov ip, r2
    ldr r2, =SIO_BASE
    // note we don't use stm; not because it could be restarted by an interrupt (which would be harmless),
    // but because this is single-cycle IO space, so 4 separate stores are cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    mov r2, ip
    pop {r4, r5, r6, r7, pc}
.endm


// since idiv and idivmod only differ by a cycle, we'll make them the same!
div_section WRAPPER_FUNC_NAME(__aeabi_idiv)
.align 2
wrapper_func __aeabi_idiv
wrapper_func __aeabi_idivmod
regular_func div_s32s32
regular_func divmod_s32s32
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore the divider state if it is dirty
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_s32s32_savestate
regular_func divmod_s32s32_unsafe
    str r0, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f
    wait_div 2
    // return a 64 bit value so we can efficiently return both (note the read order is important, since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    bx lr
1:
    // divide by zero: pass a saturated quotient to __aeabi_idiv0 (INT_MAX for +ve dividend, INT_MIN for -ve, 0 for 0)
    push {r2, lr}
    movs r1, #0x80
    lsls r1, #24
    asrs r2, r0, #31
    eors r1, r2
    cmp r0, #0
    beq 1f
    mvns r0, r1
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore the saved r2 as it holds the SIO ptr
    pop {r2, pc}
.align 2
regular_func divmod_s32s32_savestate
    save_div_state_and_lr
    bl divmod_s32s32_unsafe
    restore_div_state_and_return
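
/*
 * From C, `/` and `%` on 32-bit ints reach the wrappers above via the
 * compiler-generated __aeabi_idiv/__aeabi_idivmod calls; the divmod_* entry
 * points can also be called directly. A usage sketch (declarations come from
 * pico/divider.h, and the packed-result helpers from hardware/divider.h):
 *
 *   #include "pico/divider.h"
 *
 *   void example(int32_t a, int32_t b) {
 *       divmod_result_t qr = divmod_s32s32(a, b);  // quotient and remainder from one division
 *       int32_t q = to_quotient_s32(qr);           // low word of the packed result
 *       int32_t r = to_remainder_s32(qr);          // high word of the packed result
 *       (void) q; (void) r;
 *   }
 */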

// since uidiv and uidivmod only differ by a cycle, we'll make them the same!
div_section WRAPPER_FUNC_NAME(__aeabi_uidiv)
regular_func div_u32u32
regular_func divmod_u32u32
wrapper_func __aeabi_uidiv
wrapper_func __aeabi_uidivmod
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore the divider state if it is dirty
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_u32u32_savestate
regular_func divmod_u32u32_unsafe
    str r0, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f
    wait_div 2
    // return a 64 bit value so we can efficiently return both (note the read order is important, since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    bx lr
1:
    // divide by zero: pass a saturated quotient to __aeabi_idiv0 (0xffffffff for a non-zero dividend, 0 for 0)
    push {r2, lr}
    cmp r0, #0
    beq 1f
    movs r0, #0
    mvns r0, r0
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore the saved r2 as it holds the SIO ptr
    pop {r2, pc}
.align 2
regular_func divmod_u32u32_savestate
    save_div_state_and_lr
    bl divmod_u32u32_unsafe
    restore_div_state_and_return
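
/*
 * Divide-by-zero handling above follows the ARM EABI convention: a saturated
 * quotient is passed to (and, by default, returned from) __aeabi_idiv0, with
 * the remainder forced to 0. When PICO_DIVIDER_CALL_IDIV0 is enabled, an
 * application can trap the condition by supplying its own handler; a sketch:
 *
 *   int __aeabi_idiv0(int result) {
 *       panic("integer divide by zero");   // panic() from pico/platform.h
 *       return result;                     // not reached; the default just returns the argument
 *   }
 */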

div_section WRAPPER_FUNC_NAME(__aeabi_ldiv)

.align 2
wrapper_func __aeabi_ldivmod
regular_func div_s64s64
regular_func divmod_s64s64
    mov ip, r2
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore the divider state if it is dirty
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
    bcs divmod_s64s64_savestate
    b divmod_s64s64_unsafe
.align 2
divmod_s64s64_savestate:
    save_div_state_and_lr_64
    bl divmod_s64s64_unsafe
    restore_div_state_and_return_64

.align 2
wrapper_func __aeabi_uldivmod
regular_func div_u64u64
regular_func divmod_u64u64
    mov ip, r2
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore the divider state if it is dirty
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
    bcs divmod_u64u64_savestate
    b divmod_u64u64_unsafe
.align 2
regular_func divmod_u64u64_savestate
    save_div_state_and_lr_64
    bl divmod_u64u64_unsafe
    restore_div_state_and_return_64
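
@ dneg negates the 64-bit two's complement value in \lo:\hi; in C terms:
@   hi = ~hi; lo = -lo; if (lo == 0) hi++;   // the carry reaches the high word only when lo was 0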
.macro dneg lo,hi
 mvns \hi,\hi
 rsbs \lo,#0
 bne l\@_1
 adds \hi,#1
l\@_1:
.endm

.align 2
regular_func divmod_s64s64_unsafe
 cmp r3,#0
 blt 1f
@ here x +ve
 beq 2f                    @ could x be zero?
3:
 cmp r1,#0
 bge divmod_u64u64_unsafe  @ both positive
@ y -ve, x +ve
 push {r14}
 dneg r0,r1
 bl divmod_u64u64_unsafe
 dneg r0,r1                @ negate the quotient
 dneg r2,r3                @ negate the remainder
 pop {r15}

2:
 cmp r2,#0
 bne 3b                    @ back if x not zero

 cmp r0,#0                 @ y==0?
 bne 4f
 cmp r1,#0
 beq 5f                    @ then pass 0 to __aeabi_ldiv0
4:
 movs r0,#0
 lsrs r1,#31
 lsls r1,#31               @ get sign bit
 bne 5f                    @ y -ve? pass -2^63 to __aeabi_ldiv0
 mvns r0,r0
 lsrs r1,r0,#1             @ y +ve: pass 2^63-1 to __aeabi_ldiv0
5:
 push {r14}
#if PICO_DIVIDER_CALL_LDIV0
 bl __aeabi_ldiv0
#endif
 movs r2,#0                @ and return 0 for the remainder
 movs r3,#0
 pop {r15}

1:
@ here x -ve
 push {r14}
 cmp r1,#0
 blt 1f
@ y +ve, x -ve
 dneg r2,r3
 bl divmod_u64u64_unsafe
 dneg r0,r1                @ negate the quotient
 pop {r15}

1:
@ y -ve, x -ve
 dneg r0,r1
 dneg r2,r3
 bl divmod_u64u64_unsafe
 dneg r2,r3                @ negate the remainder
 pop {r15}
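
/*
 * The sign handling above reduces signed division to the unsigned core,
 * following C's truncating division semantics; a sketch:
 *
 *   static int64_t sdivmod(int64_t y, int64_t x, int64_t *rem) {
 *       uint64_t uy = y < 0 ? 0 - (uint64_t) y : (uint64_t) y;
 *       uint64_t ux = x < 0 ? 0 - (uint64_t) x : (uint64_t) x;
 *       uint64_t q = uy / ux, r = uy % ux;     // divmod_u64u64_unsafe
 *       if ((y < 0) != (x < 0)) q = 0 - q;     // quotient is negative iff the signs differ
 *       if (y < 0) r = 0 - r;                  // remainder takes the dividend's sign
 *       *rem = (int64_t) r;
 *       return (int64_t) q;
 *   }
 */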

regular_func divmod_u64u64_unsafe
 cmp r1,#0
 bne y64                   @ branch if y does not fit in 32 bits
 cmp r3,#0                 @ y fits; what about x?
 bne 1f
 cmp r2,#0
 beq 2f                    @ x==0?
@ 32-bit / 32-bit: a single pass through the hardware divider
 mov r12,r7
 ldr r7,=#SIO_BASE
 str r0,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 movs r1,#0
 movs r3,#0
 wait_div 2
 ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET]
 ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]
 mov r7,r12
 bx r14

2:                         @ divide by 0 with y<2^32
 cmp r0,#0                 @ y==0?
 beq 3f                    @ then pass 0 to __aeabi_ldiv0
udiv0:
 ldr r0,=#0xffffffff
 movs r1,r0                @ pass 2^64-1 to __aeabi_ldiv0
3:
 push {r14}
#if PICO_DIVIDER_CALL_LDIV0
 bl __aeabi_ldiv0
#endif
 movs r2,#0                @ and return 0 for the remainder
 movs r3,#0
 pop {r15}

1:
 movs r2,r0                @ x>y, so the result is 0 remainder y
 movs r3,r1
 movs r0,#0
 movs r1,#0
 bx r14

.ltorg

@ here y occupies more than 32 bits
@ split into cases according to the size of x
y64:
 cmp r3,#0
 beq 1f
 b y64_x48                 @ if x does not fit in 32 bits, go to the 48- and 64-bit cases
1:
 lsrs r3,r2,#16
 bne y64_x32               @ jump if x is 17..32 bits

@ here x is at most 16 bits: do a 3-step long division in 16-bit chunks,
@ feeding each partial remainder back in as the top half of the next dividend

 cmp r2,#0
 beq udiv0                 @ x==0? exit as with the y!=0 case above
 push {r7}
 ldr r7,=#SIO_BASE
 str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 wait_div 4
 push {r4, r5}
 lsrs r4,r0,#16
 ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r0=y0-q0*x; 0<=r0<x
 ldr r1,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q0=y0/x;
 lsls r3,#16
 orrs r3,r4
 str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y1=(r0<<16)+(((ui32)y)>>16);
 wait_div 1
 uxth r4,r0
 ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r1=y1-q1*x; 0<=r1<x
 ldr r5,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q1=y1/x;
 lsls r3,#16
 orrs r3,r4
 str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y2=(r1<<16)+((ui32)y&0xffff);
 wait_div 3
 movs r3,#0
 lsls r4,r5,#16             @ quotient=(q0<<32)+(q1<<16)+q2
 lsrs r5,#16
 ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r2=y2-q2*x; 0<=r2<x
 ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q2=y2/x;
 adds r0,r4
 adcs r1,r5
 pop {r4,r5,r7}
 bx r14

.ltorg

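/*
 * A C model of the 16-bit chunk long division above (a sketch; q0,q1,q2 are
 * the partial quotients and r0,r1,r2 the partial remainders, each produced by
 * one pass through the 32-bit hardware divider):
 *
 *   static uint64_t div_y64_x16(uint64_t y, uint32_t x) {  // 0 < x < 2^16
 *       uint32_t q0 = (uint32_t)(y >> 32) / x, r0 = (uint32_t)(y >> 32) % x;
 *       uint32_t y1 = (r0 << 16) | ((uint32_t)y >> 16);
 *       uint32_t q1 = y1 / x, r1 = y1 % x;
 *       uint32_t y2 = (r1 << 16) | ((uint32_t)y & 0xffff);
 *       uint32_t q2 = y2 / x;              // the remainder y2 % x is returned in r2:r3
 *       return ((uint64_t)q0 << 32) + ((uint64_t)q1 << 16) + q2;
 *   }
 */
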
y64_x32:
@ here x is 17..32 bits
 push {r4-r7,r14}
 mov r12,r2                @ save x
 movs r5,#0                @ xsh=0
 lsrs r4,r2,#24
 bne 1f
 lsls r2,#8                @ if(x0<1U<<24) x0<<=8,xsh =8;
 adds r5,#8
1:
 lsrs r4,r2,#28
 bne 1f
 lsls r2,#4                @ if(x0<1U<<28) x0<<=4,xsh+=4;
 adds r5,#4
1:
 lsrs r4,r2,#30
 bne 1f
 lsls r2,#2                @ if(x0<1U<<30) x0<<=2,xsh+=2;
 adds r5,#2
1:
 lsrs r4,r2,#31
 bne 1f
 lsls r2,#1                @ if(x0<1U<<31) x0<<=1,xsh+=1;
 adds r5,#1
1:
@ now 2^31<=x0<2^32, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+33, 33<=qb<49
 lsrs r4,r2,#15
 adds r4,#1                @ x1=(x0>>15)+1; 2^16<x1<=2^17

 ldr r7,=#SIO_BASE
 str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 ldr r4,=#0xffffffff
 str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 lsrs r6,r1,#16
 uxth r3,r2                @ x0l
 wait_div 2
 ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ r=0xffffffffU/x1; 2^15<=r<2^16; r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2    x0
@ r4    r
@ r5    xsh
@ r12   x
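
/*
 * The stanzas below repeat one core step; in C terms (a sketch):
 *
 *   q  = (uint32_t)((y >> k) * r) >> 16;    // ~16 quotient bits; never an overestimate
 *   y -= ((uint64_t)q * x0) << (k - 31);    // retire those bits from the dividend
 *
 * for k = 48, then 35 (the final k = 22 step scales q by a further shift that
 * depends on xsh before the subtraction). Each q*x0 product is split into
 * 16-bit halves because muls is a 32x32->32 multiply.
 */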

 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>48)*r)>>16;
 lsls r7,r6,#13
 mov r14,r7                @ quh=q0<<13

 muls r3,r6                @ x0l*q
 lsrs r7,r3,#15
 lsls r3,#17               @ r3:r7 is (x0l*q)<<17
 subs r0,r3
 sbcs r1,r7                @ y-=(x0l*q)<<17

 lsrs r3,r2,#16            @ x0h
 muls r3,r6                @ q*x0h
 adds r3,r3
 subs r1,r3                @ y-=(x0h*q)<<17

 lsrs r6,r1,#3
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>35)*r)>>16;
 add r14,r6                @ quh+=q1

 uxth r3,r2                @ x0l
 muls r3,r6                @ x0l*q
 lsrs r7,r3,#28
 lsls r3,#4                @ r3:r7 is (x0l*q)<<4
 subs r0,r3
 sbcs r1,r7                @ y-=(x0l*q)<<4

 lsrs r3,r2,#16            @ x0h
 muls r3,r6                @ x0h*q
 lsrs r7,r3,#12
 lsls r3,#20               @ r3:r7 is (x0h*q)<<4
 subs r0,r3
 sbcs r1,r7                @ y-=(x0h*q)<<4

 lsrs r6,r0,#22
 lsls r7,r1,#10
 orrs r6,r7                @ y>>22
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>22)*r)>>16;

 cmp r5,#9
 blt last0                 @ if(xsh<9) goto last0;

@ on this path xsh>=9, which means x<2^23
 lsrs r2,#9                @ x0>>9: this shift loses no bits
@ the remainder y-x0*q is guaranteed less than a very small multiple of the remaining quotient
@ bits (at most 6 bits) times x, and so fits in one word
 muls r2,r6                @ x0*q
 subs r0,r2                @ y-x0*q
 lsls r7,r6,#13            @ qul=q<<13
1:
 lsrs r6,r0,#9
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>9)*r)>>16;

@ here
@ r0 y
@ r2 x0>>9
@ r5 xsh
@ r6 q
@ r7 qul
@ r12 x
@ r14 quh

 movs r3,#22
 subs r3,r5                @ 22-xsh
 lsrs r6,r3                @ q>>=22-xsh
 lsrs r7,r3                @ qul>>=22-xsh
 adds r7,r6                @ qul+=q
 mov r4,r12
 muls r6,r4                @ x*q
 subs r2,r0,r6             @ y-=x*q
 mov r0,r14                @ quh
 adds r5,#4                @ xsh+4
 adds r3,#6                @ 28-xsh
 movs r1,r0
 lsrs r1,r3
 lsls r0,r5                @ r0:r1 is quh<<(4+xsh)
 adds r0,r7
 bcc 1f
2:
 adds r1,#1
1:                         @ qu=((ui64)quh<<(4+xsh))+qul
 cmp r2,r4
 bhs 3f
 movs r3,#0
 pop {r4-r7,r15}

.ltorg

3:
 subs r2,r4
 adds r0,#1
 bcc 1b
 b 2b                      @ while(y>=x) y-=x,qu++;

@ here:
@ r0:r1 y
@ r2 x0
@ r4 r
@ r5 xsh; xsh<9
@ r6 q

last0:
 movs r7,#9
 subs r7,r5                @ 9-xsh
 lsrs r6,r7
 mov r4,r12                @ x
 uxth r2,r4
 muls r2,r6                @ q*xlo
 subs r0,r2
 bcs 1f
 subs r1,#1                @ y-=q*xlo
1:
 lsrs r2,r4,#16            @ xhi
 muls r2,r6                @ q*xhi
 lsrs r3,r2,#16
 lsls r2,#16
 subs r2,r0,r2
 sbcs r1,r3                @ y-q*xhi
 movs r3,r1                @ y now in r2:r3
 mov r0,r14                @ quh
 adds r5,#4                @ xsh+4
 adds r7,#19               @ 28-xsh
 movs r1,r0
 lsrs r1,r7
 lsls r0,r5                @ r0:r1 is quh<<(4+xsh)
 adds r0,r6
 bcc 1f
 adds r1,#1                @ (quh<<(xsh+4))+q
1:
 cmp r3,#0                 @ y>=2^32?
 bne 3f
 cmp r2,r4                 @ y>=x?
 bhs 4f
 pop {r4-r7,r15}

3:
 adds r0,#1                @ qu++
 bcc 2f
 adds r1,#1
2:
 subs r2,r4                @ y-=x
 bcs 3b
 subs r3,#1
 bne 3b

1:
 cmp r2,r4
 bhs 4f
 pop {r4-r7,r15}

4:
 adds r0,#1                @ qu++
 bcc 2f
 adds r1,#1
2:
 subs r2,r4                @ y-=x
 b 1b

y64_x48:
@ here x is 33..64 bits
 push {r4-r7,r14}          @ save the working registers and the return address
 lsrs r4,r3,#16
 beq 1f
 b y64_x64                 @ jump if x is 49..64 bits
1:
 push {r2-r3}              @ save a copy of x
@ here x is 33..48 bits
 movs r5,#0                @ xsh=0
 lsrs r4,r3,#8
 bne 1f
 lsls r3,#8
 lsrs r6,r2,#24
 orrs r3,r6
 lsls r2,#8                @ if(x0<1U<<40) x0<<=8,xsh =8;
 adds r5,#8
1:
 lsrs r4,r3,#12
 bne 1f
 lsls r3,#4
 lsrs r6,r2,#28
 orrs r3,r6
 lsls r2,#4                @ if(x0<1U<<44) x0<<=4,xsh+=4;
 adds r5,#4
1:
 lsrs r4,r3,#14
 bne 1f
 lsls r3,#2
 lsrs r6,r2,#30
 orrs r3,r6
 lsls r2,#2                @ if(x0<1U<<46) x0<<=2,xsh+=2;
 adds r5,#2
1:
 lsrs r4,r3,#15
 bne 1f
 adds r2,r2
 adcs r3,r3                @ if(x0<1U<<47) x0<<=1,xsh+=1;
 adds r5,#1
1:
@ now 2^47<=x0<2^48, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+17, 17<=qb<33
 movs r4,r3
 adds r7,r2,r2             @ r7 is scratch: only the carry out of x0lo<<1 is needed
 adcs r4,r4
 adds r4,#1                @ x1=(ui32)(x0>>31)+1; 2^16<x1<=2^17

 ldr r7,=#SIO_BASE
 str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 ldr r4,=#0xffffffff
 str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 lsrs r6,r1,#16
 wait_div 1
 ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ r=0xffffffffU/x1; 2^15<=r<2^16; r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2:r3 x0
@ r4    r
@ r5    xsh 0<=xsh<16

 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>48)*r)>>16;
 lsls r7,r6,#13
 mov r14,r7                @ save q<<13
 uxth r7,r2                @ x0l
 muls r7,r6
 subs r0,r7
 bcs 1f
 subs r1,#1
1:
 subs r0,r7
 bcs 1f
 subs r1,#1
1:
 uxth r7,r3                @ x0h
 muls r7,r6
 subs r1,r7
 subs r1,r7
 lsrs r7,r2,#16            @ x0m
 muls r7,r6
 lsls r6,r7,#17
 lsrs r7,#15
 subs r0,r6
 sbcs r1,r7                @ y-=((ui64)q*x0)<<1;

 lsrs r6,r1,#3             @ y>>35
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>35)*r)>>16;

 cmp r5,#12
 blt last1                 @ if(xsh<12) goto last1;

 add r14,r6                @ qu<<13+q
 lsrs r2,#12
 lsls r7,r3,#20
 orrs r2,r7
 lsrs r3,#12               @ x0>>12

 uxth r7,r2                @ x0l
 muls r7,r6
 subs r0,r7
 bcs 1f
 subs r1,#1
1:
 uxth r7,r3                @ x0h
 muls r7,r6
 subs r1,r7
 lsrs r7,r2,#16            @ x0m
 muls r7,r6
 lsls r6,r7,#16
 lsrs r7,#16
 subs r0,r6
 sbcs r1,r7                @ y-=((ui64)q*x0)>>12

 lsrs r6,r0,#22
 lsls r7,r1,#10
 orrs r6,r7                @ y>>22
 muls r6,r4
 movs r7,#41
 subs r7,r5
 lsrs r6,r7                @ q=((ui32)(y>>22)*r)>>(16+25-xsh)

 subs r5,#12
 mov r7,r14
 lsls r7,r5
2:
 adds r7,r6                @ qu=(qu<<(xsh-12))+q
 pop {r4,r5}               @ recall x

@ here
@ r0:r1 y
@ r4:r5 x
@ r6 q
@ r7 qu

 uxth r2,r4
 uxth r3,r5
 muls r2,r6                @ xlo*q
 muls r3,r6                @ xhi*q
 subs r0,r2
 sbcs r1,r3
 lsrs r2,r4,#16
 muls r2,r6
 lsrs r3,r2,#16
 lsls r2,#16               @ xm*q
 subs r0,r2
 sbcs r1,r3                @ y-=(ui64)q*x

1:
 movs r2,r0
 movs r3,r1
 adds r7,#1
 subs r0,r4
 sbcs r1,r5                @ while(y>=x) y-=x,qu++;
 bhs 1b
 subs r0,r7,#1             @ correction to qu: the loop overshoots by one
 movs r1,#0
 pop {r4-r7,r15}

last1:
@ r0:r1 y
@ r2:r3 x0
@ r5 xsh
@ r6 q

 movs r7,#12
 subs r7,r5
 lsrs r6,r7                @ q>>=12-xsh
 mov r7,r14
 lsrs r7,#13
 lsls r7,r5
 adds r7,r7                @ qu<<(xsh+1)
 b 2b

y64_x64:
@ here x is 49..64 bits
 movs r4,#0                @ q=0 if x>>32==0xffffffff
 adds r5,r3,#1
 beq 1f

 ldr r7,=#SIO_BASE
 str r5,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 wait_div 0
 ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q=(ui32)(y>>32)/((x>>32)+1)
1:
 uxth r5,r2
 uxth r6,r3
 muls r5,r4
 muls r6,r4
 subs r0,r5
 sbcs r1,r6
 lsrs r5,r2,#16
 lsrs r6,r3,#16
 muls r5,r4
 muls r6,r4
 lsls r6,#16
 lsrs r7,r5,#16
 orrs r6,r7
 lsls r5,#16
 subs r0,r5
 sbcs r1,r6                @   y-=(ui64)q*x

 cmp r1,r3                 @   while(y>=x) y-=x,q++
 bhs 1f
3:
 movs r2,r0
 movs r3,r1
 movs r0,r4
 movs r1,#0
 pop {r4-r7,r15}

1:
 bne 2f
 cmp r0,r2
 blo 3b
2:
 subs r0,r2
 sbcs r1,r3
 adds r4,#1
 cmp r1,r3
 blo 3b
 b 1b

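/*
 * A C model of the 49..64-bit divisor case above (a sketch): the quotient fits
 * in 32 bits, and (y>>32)/((x>>32)+1) underestimates it by only a small
 * amount, which the trailing while loop corrects:
 *
 *   static uint64_t div_y64_x64(uint64_t y, uint64_t x) {   // x >= 2^48
 *       uint32_t q = (uint32_t)(x >> 32) == 0xffffffffu ? 0
 *                  : (uint32_t)(y >> 32) / ((uint32_t)(x >> 32) + 1);
 *       y -= (uint64_t)q * x;
 *       while (y >= x) { y -= x; q++; }      // the remainder ends up in y
 *       return q;
 *   }
 */
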
div_section divmod_s64s64_rem
regular_func divmod_s64s64_rem
    push {r4, lr}
    bl divmod_s64s64
    ldr r4, [sp, #8]          @ the remainder pointer is the 5th argument word, passed on the stack
    stmia r4!, {r2,r3}        @ *rem = remainder (in r2:r3)
    pop {r4, pc}

div_section divmod_u64u64_rem
regular_func divmod_u64u64_rem
    push {r4, lr}
    bl divmod_u64u64
    ldr r4, [sp, #8]          @ the remainder pointer is the 5th argument word, passed on the stack
    stmia r4!, {r2,r3}        @ *rem = remainder (in r2:r3)
    pop {r4, pc}

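/*
 * These are the C-callable variants declared in pico/divider.h, e.g.
 *
 *   uint64_t r;
 *   uint64_t q = divmod_u64u64_rem(y, x, &r);
 *
 * Under AAPCS the two 64-bit operands occupy r0-r3, so the remainder pointer
 * arrives on the stack; after the push of {r4, lr} it sits at [sp, #8].
 */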