/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "hardware/regs/sio.h"
#include "hardware/regs/addressmap.h"

.syntax unified
.cpu cortex-m0plus
.thumb

#include "pico/asm_helper.S"

#ifndef PICO_DIVIDER_CALL_IDIV0
#define PICO_DIVIDER_CALL_IDIV0 1
#endif

#ifndef PICO_DIVIDER_CALL_LDIV0
#define PICO_DIVIDER_CALL_LDIV0 1
#endif
.macro div_section name
#if PICO_DIVIDER_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

#if SIO_DIV_CSR_READY_LSB == 0
.equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
#else
#error SIO_DIV_CSR_READY_SHIFT_FOR_CARRY above needs updating for the new value of SIO_DIV_CSR_READY_LSB
#endif
#if SIO_DIV_CSR_DIRTY_LSB == 1
.equ SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY, 2
#else
#error SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY above needs updating for the new value of SIO_DIV_CSR_DIRTY_LSB
#endif

@ the hardware divider takes 8 cycles; wait 8-n cycles, where n is the number of
@ cycles the caller's own instructions spend before reading the results
.macro wait_div n
.rept (8-\n) / 2
  b 9f
9:
.endr
.if (8-\n) % 2
 nop
.endif
.endm
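
/*
 * For reference, a minimal C sketch of the access pattern this file implements
 * (sio_hw and the register/field names are assumed from hardware/structs/sio.h
 * and hardware/regs/sio.h):
 *
 *   static uint32_t hw_udiv32(uint32_t y, uint32_t x) {
 *       sio_hw->div_udividend = y;                         // writing an operand starts a calculation
 *       sio_hw->div_udivisor = x;
 *       while (!(sio_hw->div_csr & SIO_DIV_CSR_READY_BITS))
 *           ;                                              // results are valid 8 cycles after the writes
 *       uint32_t r = sio_hw->div_remainder;                // remainder first...
 *       uint32_t q = sio_hw->div_quotient;                 // ...QUOTIENT must be read last (it clears DIRTY)
 *       (void) r;
 *       return q;
 *   }
 *
 * The wait_div macro replaces the busy-wait with a fixed delay, since the
 * delay from writing the operands to the results being ready is always 8 cycles.
 */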


#if (SIO_DIV_SDIVISOR_OFFSET != SIO_DIV_SDIVIDEND_OFFSET + 4) || (SIO_DIV_QUOTIENT_OFFSET != SIO_DIV_SDIVISOR_OFFSET + 4) || (SIO_DIV_REMAINDER_OFFSET != SIO_DIV_QUOTIENT_OFFSET + 4)
#error register layout has changed - we rely on this order to make sure we save/restore in the right order
#endif

# expects the SIO_BASE pointer in r2
.macro save_div_state_and_lr
1:
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    # wait for the current result, as we can't save the signed-ness of the operation in flight
    lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
    bcc 1b
    push {r4, r5, r6, r7, lr}
    // note we must read the quotient last, and since it isn't the last register, we can't use ldmia!
    ldr r4, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    ldr r5, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    ldr r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
.endm

.macro restore_div_state_and_return
    // writing sdividend (r4), sdivisor (r5), remainder (r7), quotient (r6) in that order
    //
    // it is worth considering what happens if we are interrupted
    //
    // after writing r4: we are DIRTY and !READY
    //    ... an interruptor using the divider will complete based on incorrect inputs, but at least the dividend
    //        will be saved/restored correctly, and we'll restore the rest ourselves
    // after writing r4, r5: we are DIRTY and !READY
    //    ... an interruptor using the divider will complete based on possibly wrongly signed inputs, but at least
    //        the dividend and divisor will be saved/restored correctly, and we'll restore the rest ourselves
    // after writing r4, r5, r7: we are DIRTY and READY
    //    ... an interruptor using the divider will see the dividend, divisor and remainder registers as is
    //        (what we just restored ourselves), and we'll restore the quotient after the fact

    // note we don't use stm; not because it could be restarted by an interrupt (which would be harmless),
    // but because this is single-cycle IO space, so 4 separate stores are cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    pop {r4, r5, r6, r7, pc}
.endm
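
/*
 * The save/restore pair above is the assembly analogue of what the SDK's
 * hardware_divider library exposes in C as hw_divider_save_state() /
 * hw_divider_restore_state(); a sketch of the idea:
 *
 *   typedef struct { uint32_t values[4]; } div_state_t;    // dividend, divisor, remainder, quotient
 *
 *   static void div_save(div_state_t *s) {
 *       while (!(sio_hw->div_csr & SIO_DIV_CSR_READY_BITS))
 *           ;                                   // a division in flight can't be snapshotted
 *       s->values[0] = sio_hw->div_udividend;
 *       s->values[1] = sio_hw->div_udivisor;
 *       s->values[2] = sio_hw->div_remainder;
 *       s->values[3] = sio_hw->div_quotient;    // quotient read last, as above
 *   }
 *
 *   static void div_restore(const div_state_t *s) {
 *       sio_hw->div_udividend = s->values[0];
 *       sio_hw->div_udivisor  = s->values[1];
 *       sio_hw->div_remainder = s->values[2];
 *       sio_hw->div_quotient  = s->values[3];
 *   }
 */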

.macro save_div_state_and_lr_64
    push {r4, r5, r6, r7, lr}
    ldr r6, =SIO_BASE
1:
    ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
    # wait for the current result, as we can't save the signed-ness of the operation in flight
    lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
    bcc 1b
    // note we must read the quotient last, and since it isn't the last register, we can't use ldmia!
    ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
    ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
    ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
.endm

.macro restore_div_state_and_return_64
    // writing udividend (r4), udivisor (r5), remainder (r7), quotient (r6) in that order
    //
    // it is worth considering what happens if we are interrupted
    //
    // after writing r4: we are DIRTY and !READY
    //    ... an interruptor using the divider will complete based on incorrect inputs, but at least the dividend
    //        will be saved/restored correctly, and we'll restore the rest ourselves
    // after writing r4, r5: we are DIRTY and !READY
    //    ... an interruptor using the divider will complete based on possibly wrongly signed inputs, but at least
    //        the dividend and divisor will be saved/restored correctly, and we'll restore the rest ourselves
    // after writing r4, r5, r7: we are DIRTY and READY
    //    ... an interruptor using the divider will see the dividend, divisor and remainder registers as is
    //        (what we just restored ourselves), and we'll restore the quotient after the fact

    mov ip, r2
    ldr r2, =SIO_BASE
    // note we don't use stm; not because it could be restarted by an interrupt (which would be harmless),
    // but because this is single-cycle IO space, so 4 separate stores are cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    mov r2, ip
    pop {r4, r5, r6, r7, pc}
.endm


// since idiv and idivmod only differ by a cycle, we'll make them the same!
div_section WRAPPER_FUNC_NAME(__aeabi_idiv)
.align 2
wrapper_func __aeabi_idiv
wrapper_func __aeabi_idivmod
regular_func div_s32s32
regular_func divmod_s32s32
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore the divider state if it is dirty
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_s32s32_savestate
regular_func divmod_s32s32_unsafe
    str r0, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f
    wait_div 2
    // return a 64 bit value so we can efficiently return both (note the read order is important, since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    bx lr
1:
    // divide by zero: pass a saturated quotient to __aeabi_idiv0 (INT_MAX for +ve dividend, INT_MIN for -ve, 0 for 0)
    push {r2, lr}
    movs r1, #0x80
    lsls r1, #24
    asrs r2, r0, #31
    eors r1, r2
    cmp r0, #0
    beq 1f
    mvns r0, r1
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore the saved r2 as it holds the SIO ptr
    pop {r2, pc}
.align 2
regular_func divmod_s32s32_savestate
    save_div_state_and_lr
    bl divmod_s32s32_unsafe
    restore_div_state_and_return
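
/*
 * From C, `/` and `%` on 32-bit ints reach the wrappers above via the
 * compiler-generated __aeabi_idiv/__aeabi_idivmod calls; the divmod_* entry
 * points can also be called directly. A usage sketch (declarations come from
 * pico/divider.h, and the packed-result helpers from hardware/divider.h):
 *
 *   #include "pico/divider.h"
 *
 *   void example(int32_t a, int32_t b) {
 *       divmod_result_t qr = divmod_s32s32(a, b);  // quotient and remainder from one division
 *       int32_t q = to_quotient_s32(qr);           // low word of the packed result
 *       int32_t r = to_remainder_s32(qr);          // high word of the packed result
 *       (void) q; (void) r;
 *   }
 */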

// since uidiv and uidivmod only differ by a cycle, we'll make them the same!
div_section WRAPPER_FUNC_NAME(__aeabi_uidiv)
regular_func div_u32u32
regular_func divmod_u32u32
wrapper_func __aeabi_uidiv
wrapper_func __aeabi_uidivmod
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore the divider state if it is dirty
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_u32u32_savestate
regular_func divmod_u32u32_unsafe
    str r0, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f
    wait_div 2
    // return a 64 bit value so we can efficiently return both (note the read order is important, since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    bx lr
1:
    // divide by zero: pass a saturated quotient to __aeabi_idiv0 (0xffffffff for a non-zero dividend, 0 for 0)
    push {r2, lr}
    cmp r0, #0
    beq 1f
    movs r0, #0
    mvns r0, r0
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore the saved r2 as it holds the SIO ptr
    pop {r2, pc}
.align 2
regular_func divmod_u32u32_savestate
    save_div_state_and_lr
    bl divmod_u32u32_unsafe
    restore_div_state_and_return
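
/*
 * Divide-by-zero handling above follows the ARM EABI convention: a saturated
 * quotient is passed to (and, by default, returned from) __aeabi_idiv0, with
 * the remainder forced to 0. When PICO_DIVIDER_CALL_IDIV0 is enabled, an
 * application can trap the condition by supplying its own handler; a sketch:
 *
 *   int __aeabi_idiv0(int result) {
 *       panic("integer divide by zero");   // panic() from pico/platform.h
 *       return result;                     // not reached; the default just returns the argument
 *   }
 */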

div_section WRAPPER_FUNC_NAME(__aeabi_ldiv)

.align 2
wrapper_func __aeabi_ldivmod
regular_func div_s64s64
regular_func divmod_s64s64
    mov ip, r2
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore the divider state if it is dirty
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
    bcs divmod_s64s64_savestate
    b divmod_s64s64_unsafe
.align 2
divmod_s64s64_savestate:
    save_div_state_and_lr_64
    bl divmod_s64s64_unsafe
    restore_div_state_and_return_64

.align 2
wrapper_func __aeabi_uldivmod
regular_func div_u64u64
regular_func divmod_u64u64
    mov ip, r2
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore the divider state if it is dirty
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
    bcs divmod_u64u64_savestate
    b divmod_u64u64_unsafe
.align 2
regular_func divmod_u64u64_savestate
    save_div_state_and_lr_64
    bl divmod_u64u64_unsafe
    restore_div_state_and_return_64
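
@ dneg negates the 64-bit two's complement value in \lo:\hi; in C terms:
@   hi = ~hi; lo = -lo; if (lo == 0) hi++;   // the carry reaches the high word only when lo was 0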
.macro dneg lo,hi
 mvns \hi,\hi
 rsbs \lo,#0
 bne l\@_1
 adds \hi,#1
l\@_1:
.endm

.align 2
regular_func divmod_s64s64_unsafe
 cmp r3,#0
 blt 1f
@ here x +ve
 beq 2f                    @ could x be zero?
3:
 cmp r1,#0
 bge divmod_u64u64_unsafe  @ both positive
@ y -ve, x +ve
 push {r14}
 dneg r0,r1
 bl divmod_u64u64_unsafe
 dneg r0,r1                @ negate the quotient
 dneg r2,r3                @ negate the remainder
 pop {r15}

2:
 cmp r2,#0
 bne 3b                    @ back if x not zero

 cmp r0,#0                 @ y==0?
 bne 4f
 cmp r1,#0
 beq 5f                    @ then pass 0 to __aeabi_ldiv0
4:
 movs r0,#0
 lsrs r1,#31
 lsls r1,#31               @ get sign bit
 bne 5f                    @ y -ve? pass -2^63 to __aeabi_ldiv0
 mvns r0,r0
 lsrs r1,r0,#1             @ y +ve: pass 2^63-1 to __aeabi_ldiv0
5:
 push {r14}
#if PICO_DIVIDER_CALL_LDIV0
 bl __aeabi_ldiv0
#endif
 movs r2,#0                @ and return 0 for the remainder
 movs r3,#0
 pop {r15}

1:
@ here x -ve
 push {r14}
 cmp r1,#0
 blt 1f
@ y +ve, x -ve
 dneg r2,r3
 bl divmod_u64u64_unsafe
 dneg r0,r1                @ negate the quotient
 pop {r15}

1:
@ y -ve, x -ve
 dneg r0,r1
 dneg r2,r3
 bl divmod_u64u64_unsafe
 dneg r2,r3                @ negate the remainder
 pop {r15}
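
/*
 * The sign handling above reduces signed division to the unsigned core,
 * following C's truncating division semantics; a sketch:
 *
 *   static int64_t sdivmod(int64_t y, int64_t x, int64_t *rem) {
 *       uint64_t uy = y < 0 ? 0 - (uint64_t) y : (uint64_t) y;
 *       uint64_t ux = x < 0 ? 0 - (uint64_t) x : (uint64_t) x;
 *       uint64_t q = uy / ux, r = uy % ux;     // divmod_u64u64_unsafe
 *       if ((y < 0) != (x < 0)) q = 0 - q;     // quotient is negative iff the signs differ
 *       if (y < 0) r = 0 - r;                  // remainder takes the dividend's sign
 *       *rem = (int64_t) r;
 *       return (int64_t) q;
 *   }
 */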

regular_func divmod_u64u64_unsafe
 cmp r1,#0
 bne y64                   @ branch if y does not fit in 32 bits
 cmp r3,#0                 @ y fits; what about x?
 bne 1f
 cmp r2,#0
 beq 2f                    @ x==0?
@ 32-bit / 32-bit: a single pass through the hardware divider
 mov r12,r7
 ldr r7,=#SIO_BASE
 str r0,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 movs r1,#0
 movs r3,#0
 wait_div 2
 ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET]
 ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]
 mov r7,r12
 bx r14

2:                         @ divide by 0 with y<2^32
 cmp r0,#0                 @ y==0?
 beq 3f                    @ then pass 0 to __aeabi_ldiv0
udiv0:
 ldr r0,=#0xffffffff
 movs r1,r0                @ pass 2^64-1 to __aeabi_ldiv0
3:
 push {r14}
#if PICO_DIVIDER_CALL_LDIV0
 bl __aeabi_ldiv0
#endif
 movs r2,#0                @ and return 0 for the remainder
 movs r3,#0
 pop {r15}

1:
 movs r2,r0                @ x>y, so the result is 0 remainder y
 movs r3,r1
 movs r0,#0
 movs r1,#0
 bx r14

.ltorg

@ here y occupies more than 32 bits
@ split into cases according to the size of x
y64:
 cmp r3,#0
 beq 1f
 b y64_x48                 @ if x does not fit in 32 bits, go to the 48- and 64-bit cases
1:
 lsrs r3,r2,#16
 bne y64_x32               @ jump if x is 17..32 bits

@ here x is at most 16 bits: do a 3-step long division in 16-bit chunks,
@ feeding each partial remainder back in as the top half of the next dividend

 cmp r2,#0
 beq udiv0                 @ x==0? exit as with the y!=0 case above
 push {r7}
 ldr r7,=#SIO_BASE
 str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 wait_div 4
 push {r4, r5}
 lsrs r4,r0,#16
 ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r0=y0-q0*x; 0<=r0<x
 ldr r1,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q0=y0/x;
 lsls r3,#16
 orrs r3,r4
 str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y1=(r0<<16)+(((ui32)y)>>16);
 wait_div 1
 uxth r4,r0
 ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r1=y1-q1*x; 0<=r1<x
 ldr r5,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q1=y1/x;
 lsls r3,#16
 orrs r3,r4
 str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y2=(r1<<16)+((ui32)y&0xffff);
 wait_div 3
 movs r3,#0
 lsls r4,r5,#16             @ quotient=(q0<<32)+(q1<<16)+q2
 lsrs r5,#16
 ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r2=y2-q2*x; 0<=r2<x
 ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q2=y2/x;
 adds r0,r4
 adcs r1,r5
 pop {r4,r5,r7}
 bx r14

.ltorg

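/*
 * A C model of the 16-bit chunk long division above (a sketch; q0,q1,q2 are
 * the partial quotients and r0,r1,r2 the partial remainders, each produced by
 * one pass through the 32-bit hardware divider):
 *
 *   static uint64_t div_y64_x16(uint64_t y, uint32_t x) {  // 0 < x < 2^16
 *       uint32_t q0 = (uint32_t)(y >> 32) / x, r0 = (uint32_t)(y >> 32) % x;
 *       uint32_t y1 = (r0 << 16) | ((uint32_t)y >> 16);
 *       uint32_t q1 = y1 / x, r1 = y1 % x;
 *       uint32_t y2 = (r1 << 16) | ((uint32_t)y & 0xffff);
 *       uint32_t q2 = y2 / x;              // the remainder y2 % x is returned in r2:r3
 *       return ((uint64_t)q0 << 32) + ((uint64_t)q1 << 16) + q2;
 *   }
 */
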
y64_x32:
@ here x is 17..32 bits
 push {r4-r7,r14}
 mov r12,r2                @ save x
 movs r5,#0                @ xsh=0
 lsrs r4,r2,#24
 bne 1f
 lsls r2,#8                @ if(x0<1U<<24) x0<<=8,xsh =8;
 adds r5,#8
1:
 lsrs r4,r2,#28
 bne 1f
 lsls r2,#4                @ if(x0<1U<<28) x0<<=4,xsh+=4;
 adds r5,#4
1:
 lsrs r4,r2,#30
 bne 1f
 lsls r2,#2                @ if(x0<1U<<30) x0<<=2,xsh+=2;
 adds r5,#2
1:
 lsrs r4,r2,#31
 bne 1f
 lsls r2,#1                @ if(x0<1U<<31) x0<<=1,xsh+=1;
 adds r5,#1
1:
@ now 2^31<=x0<2^32, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+33, 33<=qb<49
 lsrs r4,r2,#15
 adds r4,#1                @ x1=(x0>>15)+1; 2^16<x1<=2^17

 ldr r7,=#SIO_BASE
 str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 ldr r4,=#0xffffffff
 str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 lsrs r6,r1,#16
 uxth r3,r2                @ x0l
 wait_div 2
 ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ r=0xffffffffU/x1; 2^15<=r<2^16; r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2    x0
@ r4    r
@ r5    xsh
@ r12   x
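
/*
 * The stanzas below repeat one core step; in C terms (a sketch):
 *
 *   q  = (uint32_t)((y >> k) * r) >> 16;    // ~16 quotient bits; never an overestimate
 *   y -= ((uint64_t)q * x0) << (k - 31);    // retire those bits from the dividend
 *
 * for k = 48, then 35 (the final k = 22 step scales q by a further shift that
 * depends on xsh before the subtraction). Each q*x0 product is split into
 * 16-bit halves because muls is a 32x32->32 multiply.
 */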

 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>48)*r)>>16;
 lsls r7,r6,#13
 mov r14,r7                @ quh=q0<<13

 muls r3,r6                @ x0l*q
 lsrs r7,r3,#15
 lsls r3,#17               @ r3:r7 is (x0l*q)<<17
 subs r0,r3
 sbcs r1,r7                @ y-=(x0l*q)<<17

 lsrs r3,r2,#16            @ x0h
 muls r3,r6                @ q*x0h
 adds r3,r3
 subs r1,r3                @ y-=(x0h*q)<<17

 lsrs r6,r1,#3
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>35)*r)>>16;
 add r14,r6                @ quh+=q1

 uxth r3,r2                @ x0l
 muls r3,r6                @ x0l*q
 lsrs r7,r3,#28
 lsls r3,#4                @ r3:r7 is (x0l*q)<<4
 subs r0,r3
 sbcs r1,r7                @ y-=(x0l*q)<<4

 lsrs r3,r2,#16            @ x0h
 muls r3,r6                @ x0h*q
 lsrs r7,r3,#12
 lsls r3,#20               @ r3:r7 is (x0h*q)<<4
 subs r0,r3
 sbcs r1,r7                @ y-=(x0h*q)<<4

 lsrs r6,r0,#22
 lsls r7,r1,#10
 orrs r6,r7                @ y>>22
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>22)*r)>>16;

 cmp r5,#9
 blt last0                 @ if(xsh<9) goto last0;

@ on this path xsh>=9, which means x<2^23
 lsrs r2,#9                @ x0>>9: this shift loses no bits
@ the remainder y-x0*q is guaranteed less than a very small multiple of the remaining quotient
@ bits (at most 6 bits) times x, and so fits in one word
 muls r2,r6                @ x0*q
 subs r0,r2                @ y-x0*q
 lsls r7,r6,#13            @ qul=q<<13
1:
 lsrs r6,r0,#9
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>9)*r)>>16;

@ here
@ r0 y
@ r2 x0>>9
@ r5 xsh
@ r6 q
@ r7 qul
@ r12 x
@ r14 quh

 movs r3,#22
 subs r3,r5                @ 22-xsh
 lsrs r6,r3                @ q>>=22-xsh
 lsrs r7,r3                @ qul>>=22-xsh
 adds r7,r6                @ qul+=q
 mov r4,r12
 muls r6,r4                @ x*q
 subs r2,r0,r6             @ y-=x*q
 mov r0,r14                @ quh
 adds r5,#4                @ xsh+4
 adds r3,#6                @ 28-xsh
 movs r1,r0
 lsrs r1,r3
 lsls r0,r5                @ r0:r1 is quh<<(4+xsh)
 adds r0,r7
 bcc 1f
2:
 adds r1,#1
1:                         @ qu=((ui64)quh<<(4+xsh))+qul
 cmp r2,r4
 bhs 3f
 movs r3,#0
 pop {r4-r7,r15}

.ltorg

3:
 subs r2,r4
 adds r0,#1
 bcc 1b
 b 2b                      @ while(y>=x) y-=x,qu++;

@ here:
@ r0:r1 y
@ r2 x0
@ r4 r
@ r5 xsh; xsh<9
@ r6 q

last0:
 movs r7,#9
 subs r7,r5                @ 9-xsh
 lsrs r6,r7
 mov r4,r12                @ x
 uxth r2,r4
 muls r2,r6                @ q*xlo
 subs r0,r2
 bcs 1f
 subs r1,#1                @ y-=q*xlo
1:
 lsrs r2,r4,#16            @ xhi
 muls r2,r6                @ q*xhi
 lsrs r3,r2,#16
 lsls r2,#16
 subs r2,r0,r2
 sbcs r1,r3                @ y-q*xhi
 movs r3,r1                @ y now in r2:r3
 mov r0,r14                @ quh
 adds r5,#4                @ xsh+4
 adds r7,#19               @ 28-xsh
 movs r1,r0
 lsrs r1,r7
 lsls r0,r5                @ r0:r1 is quh<<(4+xsh)
 adds r0,r6
 bcc 1f
 adds r1,#1                @ (quh<<(xsh+4))+q
1:
 cmp r3,#0                 @ y>=2^32?
 bne 3f
 cmp r2,r4                 @ y>=x?
 bhs 4f
 pop {r4-r7,r15}

3:
 adds r0,#1                @ qu++
 bcc 2f
 adds r1,#1
2:
 subs r2,r4                @ y-=x
 bcs 3b
 subs r3,#1
 bne 3b

1:
 cmp r2,r4
 bhs 4f
 pop {r4-r7,r15}

4:
 adds r0,#1                @ qu++
 bcc 2f
 adds r1,#1
2:
 subs r2,r4                @ y-=x
 b 1b

y64_x48:
@ here x is 33..64 bits
 push {r4-r7,r14}          @ save the working registers and the return address
 lsrs r4,r3,#16
 beq 1f
 b y64_x64                 @ jump if x is 49..64 bits
1:
 push {r2-r3}              @ save a copy of x
@ here x is 33..48 bits
 movs r5,#0                @ xsh=0
 lsrs r4,r3,#8
 bne 1f
 lsls r3,#8
 lsrs r6,r2,#24
 orrs r3,r6
 lsls r2,#8                @ if(x0<1U<<40) x0<<=8,xsh =8;
 adds r5,#8
1:
 lsrs r4,r3,#12
 bne 1f
 lsls r3,#4
 lsrs r6,r2,#28
 orrs r3,r6
 lsls r2,#4                @ if(x0<1U<<44) x0<<=4,xsh+=4;
 adds r5,#4
1:
 lsrs r4,r3,#14
 bne 1f
 lsls r3,#2
 lsrs r6,r2,#30
 orrs r3,r6
 lsls r2,#2                @ if(x0<1U<<46) x0<<=2,xsh+=2;
 adds r5,#2
1:
 lsrs r4,r3,#15
 bne 1f
 adds r2,r2
 adcs r3,r3                @ if(x0<1U<<47) x0<<=1,xsh+=1;
 adds r5,#1
1:
@ now 2^47<=x0<2^48, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+17, 17<=qb<33
 movs r4,r3
 adds r7,r2,r2             @ r7 is scratch: only the carry out of x0lo<<1 is needed
 adcs r4,r4
 adds r4,#1                @ x1=(ui32)(x0>>31)+1; 2^16<x1<=2^17

 ldr r7,=#SIO_BASE
 str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 ldr r4,=#0xffffffff
 str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 lsrs r6,r1,#16
 wait_div 1
 ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ r=0xffffffffU/x1; 2^15<=r<2^16; r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2:r3 x0
@ r4    r
@ r5    xsh 0<=xsh<16

 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>48)*r)>>16;
 lsls r7,r6,#13
 mov r14,r7                @ save q<<13
 uxth r7,r2                @ x0l
 muls r7,r6
 subs r0,r7
 bcs 1f
 subs r1,#1
1:
 subs r0,r7
 bcs 1f
 subs r1,#1
1:
 uxth r7,r3                @ x0h
 muls r7,r6
 subs r1,r7
 subs r1,r7
 lsrs r7,r2,#16            @ x0m
 muls r7,r6
 lsls r6,r7,#17
 lsrs r7,#15
 subs r0,r6
 sbcs r1,r7                @ y-=((ui64)q*x0)<<1;

 lsrs r6,r1,#3             @ y>>35
 muls r6,r4
 lsrs r6,#16               @ q=((ui32)(y>>35)*r)>>16;

 cmp r5,#12
 blt last1                 @ if(xsh<12) goto last1;

 add r14,r6                @ qu<<13+q
 lsrs r2,#12
 lsls r7,r3,#20
 orrs r2,r7
 lsrs r3,#12               @ x0>>12

 uxth r7,r2                @ x0l
 muls r7,r6
 subs r0,r7
 bcs 1f
 subs r1,#1
1:
 uxth r7,r3                @ x0h
 muls r7,r6
 subs r1,r7
 lsrs r7,r2,#16            @ x0m
 muls r7,r6
 lsls r6,r7,#16
 lsrs r7,#16
 subs r0,r6
 sbcs r1,r7                @ y-=((ui64)q*x0)>>12

 lsrs r6,r0,#22
 lsls r7,r1,#10
 orrs r6,r7                @ y>>22
 muls r6,r4
 movs r7,#41
 subs r7,r5
 lsrs r6,r7                @ q=((ui32)(y>>22)*r)>>(16+25-xsh)

 subs r5,#12
 mov r7,r14
 lsls r7,r5
2:
 adds r7,r6                @ qu=(qu<<(xsh-12))+q
 pop {r4,r5}               @ recall x

@ here
@ r0:r1 y
@ r4:r5 x
@ r6 q
@ r7 qu

 uxth r2,r4
 uxth r3,r5
 muls r2,r6                @ xlo*q
 muls r3,r6                @ xhi*q
 subs r0,r2
 sbcs r1,r3
 lsrs r2,r4,#16
 muls r2,r6
 lsrs r3,r2,#16
 lsls r2,#16               @ xm*q
 subs r0,r2
 sbcs r1,r3                @ y-=(ui64)q*x

1:
 movs r2,r0
 movs r3,r1
 adds r7,#1
 subs r0,r4
 sbcs r1,r5                @ while(y>=x) y-=x,qu++;
 bhs 1b
 subs r0,r7,#1             @ correction to qu: the loop overshoots by one
 movs r1,#0
 pop {r4-r7,r15}

last1:
@ r0:r1 y
@ r2:r3 x0
@ r5 xsh
@ r6 q

 movs r7,#12
 subs r7,r5
 lsrs r6,r7                @ q>>=12-xsh
 mov r7,r14
 lsrs r7,#13
 lsls r7,r5
 adds r7,r7                @ qu<<(xsh+1)
 b 2b

y64_x64:
@ here x is 49..64 bits
 movs r4,#0                @ q=0 if x>>32==0xffffffff
 adds r5,r3,#1
 beq 1f

 ldr r7,=#SIO_BASE
 str r5,[r7,#SIO_DIV_UDIVISOR_OFFSET]
 str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
 wait_div 0
 ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q=(ui32)(y>>32)/((x>>32)+1)
1:
 uxth r5,r2
 uxth r6,r3
 muls r5,r4
 muls r6,r4
 subs r0,r5
 sbcs r1,r6
 lsrs r5,r2,#16
 lsrs r6,r3,#16
 muls r5,r4
 muls r6,r4
 lsls r6,#16
 lsrs r7,r5,#16
 orrs r6,r7
 lsls r5,#16
 subs r0,r5
 sbcs r1,r6                @   y-=(ui64)q*x

 cmp r1,r3                 @   while(y>=x) y-=x,q++
 bhs 1f
3:
 movs r2,r0
 movs r3,r1
 movs r0,r4
 movs r1,#0
 pop {r4-r7,r15}

1:
 bne 2f
 cmp r0,r2
 blo 3b
2:
 subs r0,r2
 sbcs r1,r3
 adds r4,#1
 cmp r1,r3
 blo 3b
 b 1b

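/*
 * A C model of the 49..64-bit divisor case above (a sketch): the quotient fits
 * in 32 bits, and (y>>32)/((x>>32)+1) underestimates it by only a small
 * amount, which the trailing while loop corrects:
 *
 *   static uint64_t div_y64_x64(uint64_t y, uint64_t x) {   // x >= 2^48
 *       uint32_t q = (uint32_t)(x >> 32) == 0xffffffffu ? 0
 *                  : (uint32_t)(y >> 32) / ((uint32_t)(x >> 32) + 1);
 *       y -= (uint64_t)q * x;
 *       while (y >= x) { y -= x; q++; }      // the remainder ends up in y
 *       return q;
 *   }
 */
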
div_section divmod_s64s64_rem
regular_func divmod_s64s64_rem
    push {r4, lr}
    bl divmod_s64s64
    ldr r4, [sp, #8]          @ the remainder pointer is the 5th argument word, passed on the stack
    stmia r4!, {r2,r3}        @ *rem = remainder (in r2:r3)
    pop {r4, pc}

div_section divmod_u64u64_rem
regular_func divmod_u64u64_rem
    push {r4, lr}
    bl divmod_u64u64
    ldr r4, [sp, #8]          @ the remainder pointer is the 5th argument word, passed on the stack
    stmia r4!, {r2,r3}        @ *rem = remainder (in r2:r3)
    pop {r4, pc}

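/*
 * These are the C-callable variants declared in pico/divider.h, e.g.
 *
 *   uint64_t r;
 *   uint64_t q = divmod_u64u64_rem(y, x, &r);
 *
 * Under AAPCS the two 64-bit operands occupy r0-r3, so the remainder pointer
 * arrives on the stack; after the push of {r4, lr} it sits at [sp, #8].
 */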